Patches Diff¶

Patches are not always needed to export a LLM. Most of the time, only serialization function are needed to export a LLM with cache (DynamicCache, …). Function register_additional_serialization_functions is enough in many cases.

import torch
from onnx_diagnostic.torch_export_patches import register_additional_serialization_functions

with register_additional_serialization_functions(patch_transformers=True):
    ep = torch.export.export(...)

Function torch_export_patches helps fixing some issues for many models.

import torch
from onnx_diagnostic.torch_export_patches import torch_export_patches

with torch_export_patches(patch_transformers=True):
    ep = torch.export.export(...)

Class PatchDetails gives an example on how to retrieve the list of involded patches for a specific model. Those patches belongs to the following list which depends on transformers and pytorch versions.

<<<

import torch
import transformers

print(torch.__version__, transformers.__version__)

>>>

    2.10.0.dev20251113+cu130 5.0.0.dev0

Those two versions leads to the following list of patches.

<<<

from onnx_diagnostic.torch_export_patches.patch_details import PatchDetails
from onnx_diagnostic.torch_export_patches import torch_export_patches

details = PatchDetails()
with torch_export_patches(
    patch_transformers=True,
    patch_torch=True,
    patch_diffusers=True,
    patch_details=details,
):
    pass
for patch in details.patched:
    if patch.function_to_patch == patch.patch:
        continue
    rst = patch.format_diff(format="rst")
    print()
    print()
    print(rst)
    print()
    print()

>>>

sympy: ‘sympy.core.numbers.IntegerConstant.name’ -> _patch_sympy.<locals>.<lambda>¶

1sympy.core.numbers.IntegerConstant.name = lambda self: f"IntCst{str(self)}"

torch: infer_size -> patched_infer_size¶

--- original
+++ rewritten
@@ -1,4 +1,5 @@
-def infer_size(a, b):
+def patched_infer_size(a, b):
+    """Patches ``torch._subclasses.fake_impls.infer_size``."""
     from torch.fx.experimental.symbolic_shapes import guard_or_false

     dimsA = len(a)
@@ -23,11 +24,21 @@
         # expression of an or statement as-is, without bool()'ing it; if this
         # were not the case, we'd need to write this using torch.sym_or() or
         # something like that).
-        torch._check(
-            guard_or_false(sizeA == 1) or guard_or_false(sizeB == 1) or sizeA == sizeB,
-            lambda: f"The size of tensor a ({sizeA}) "
-            f"must match the size of tensor b ({sizeB}) "
-            f"at non-singleton dimension {i})",
-        )
-        expandedSizes[i] = sizeB if guard_or_false(sizeA == 1) else sizeA
+        try:
+            b1 = guard_or_false(sizeA == 1)
+        except torch.fx.experimental.symbolic_shapes.GuardOnDataDependentSymNode:
+            b1 = False
+        try:
+            b2 = guard_or_false(sizeB == 1)
+        except torch.fx.experimental.symbolic_shapes.GuardOnDataDependentSymNode:
+            b2 = False
+        try:
+            b3 = guard_or_false(sizeA == sizeB)
+        except torch.fx.experimental.symbolic_shapes.GuardOnDataDependentSymNode:
+            b3 = False
+        if b1 or b2 or b3:
+            expandedSizes[i] = sizeB if guard_or_false(sizeA == 1) else sizeA
+        else:
+            # PATCHED: generic case, the dimension is known, no need to assert
+            expandedSizes[i] = torch.sym_max(sizeA, sizeB)
     return tuple(expandedSizes)

torch: _broadcast_shapes -> patched__broadcast_shapes¶

--- original
+++ rewritten
@@ -1,11 +1,11 @@
-def _broadcast_shapes(*_shapes):
+def patched__broadcast_shapes(*_shapes):
+    """Patches ``torch._refs._broadcast_shapes``."""
+    from functools import reduce
+    from torch._prims_common import IntLike
     from torch.fx.experimental.symbolic_shapes import (
         guard_or_false,
         is_nested_int,
-        size_hint,
     )
-
-    backed_so = torch.fx.experimental._config.backed_size_oblivious

     shapes = tuple(
         (x,) if isinstance(x, IntLike) else x for x in filter(lambda x: x is not None, _shapes)
@@ -18,17 +18,15 @@
     for shape in shapes:
         if not isinstance(shape, Sequence):
             raise RuntimeError(
-                "Input shapes should be of type ints, a tuple of ints, or a list of ints, got ",
+                "Input shapes should be of type ints, a tuple of ints, "
+                "or a list of ints, got ",
                 shape,
             )

     # Computes common shape
-    common_shape: list[Union[int, torch.SymInt]] = [
-        1,
-    ] * reduce(max, (len(shape) for shape in shapes))
-    for arg_idx, shape in enumerate(shapes):
+    common_shape = [1] * reduce(max, (len(shape) for shape in shapes))
+    for _arg_idx, shape in enumerate(shapes):
         for idx in range(-1, -1 - len(shape), -1):
-            # NB: handle nested ints specially to avoid invalid guarding on Ne(j0, 1).
             if is_nested_int(shape[idx]):
                 # Broadcasting is allowed for (j0, 1) or (j0, j0);
                 # not (j0, j1), (j0, 5), etc.
@@ -37,40 +35,17 @@
                 ):
                     continue
             else:
-                # When backed size oblivious is used, we specialize for broadcasting
-                # if its the only way to compile the example input.
-                # i.e: s0:1, s1:1 ==>
-                #           assert s0==s1, no specialization on ==1 or !=1.
-                #            The non-broadcast path is picked
-                #      s0:1, s1:4 ==>
-                #           specialize(s0) to be 1.
-                #      s0:4, s1:1 ==>
-                #           specialize(s1) to be 1.
-                if backed_so:
-                    a = size_hint(shape[idx], allow_none=True)
-                    b = size_hint(common_shape[idx], allow_none=True)
-                    if a == 1 and b != 1:
-                        torch._check(shape[idx] == 1)
-                    if b == 1 and a != 1:
-                        torch._check(common_shape[idx] == 1)
                 if guard_or_false(shape[idx] == common_shape[idx]):
                     continue
-
-            if guard_or_false(common_shape[idx] == 1):
+            # PATCHED: two cases, if == for sure, no broadcast,
+            # otherwise maybe broadcast with max(dimensions)
+            if guard_or_false(common_shape[idx] != 1):
+                pass
+            elif guard_or_false(common_shape[idx] == 1) or guard_or_false(shape[idx] != 1):
                 if shape[idx] < 0:
                     raise ValueError("Attempting to broadcast a dimension with negative length!")
                 common_shape[idx] = shape[idx]
-
-            if not is_nested_int(shape[idx]) and guard_or_false(shape[idx] == 1):
-                # broadcast case .
-                continue
             else:
-                # If broadcasting is undecided we pick non-broadcast path and add runtime assertion.
-                torch._check(
-                    common_shape[idx] == shape[idx],
-                    lambda: f"Attempting to broadcast a dimension of length {shape[idx]} at {idx}! "
-                    f"Mismatching argument at index {arg_idx} had {shape}; but expected shape "
-                    f"should be broadcastable to {common_shape}",
-                )
+                common_shape[idx] = torch.sym_max(common_shape[idx], shape[idx])

     return common_shape

torch: _constrain_user_specified_dimhint_range -> patched__constrain_user_specified_dimhint_range¶

--- original
+++ rewritten
@@ -1,28 +1,31 @@
-def _constrain_user_specified_dimhint_range(
+def patched__constrain_user_specified_dimhint_range(
     symint: torch.SymInt,
     hint: int,
-    dim: _DimHint,
+    dim: "_DimHint",  # noqa: F821
     range_constraints,
     shape_env,
-    keypath: KeyPath,
+    keypath: "KeyPath",  # noqa: F821
     i: Optional[int] = None,
 ) -> Optional[str]:
+    """Patches ``torch._export.non_strict_utils._constrain_user_specified_dimhint_range``."""
+    from torch._export.non_strict_utils import is_int, int_oo, _DimHintType, ValueRanges
+
     trace_vr = (
         range_constraints[symint.node.expr]
         if not is_int(symint)
         else ValueRanges(int(symint), int(symint))
     )
-
     # warn on 0/1 specialization for Dim.AUTO; not an actual error
-    if dim.type == _DimHintType.AUTO and trace_vr.is_singleton() and hint in (0, 1):
-        pathstr = f"inputs{pytree.keystr(keypath)}"
-        if i is not None:
-            pathstr += f".shape[{i}]"
-        msg = (
-            f"dimension {pathstr} 0/1 specialized; Dim.AUTO was specified along "
-            + f"with a sample input with hint = {hint}."
-        )
-        log.warning(msg)
+    # PATCHED: remove logging
+    # if dim.type == _DimHintType.AUTO and trace_vr.is_singleton() and hint in (0, 1):
+    #    pathstr = f"inputs{pytree.keystr(keypath)}"
+    #    if i is not None:
+    #        pathstr += f".shape[{i}]"
+    #    msg = (
+    #        f"dimension {pathstr} 0/1 specialized; Dim.AUTO was specified along "
+    #        f"with a sample input with hint = {hint}."
+    #    )
+    #    log.warning(msg)

     try:
         user_vr = ValueRanges(
@@ -38,32 +41,40 @@

         # check for Dim.DYNAMIC specializations; special case error message on 0/1
         if dim.type == _DimHintType.DYNAMIC and out_vr.is_singleton():
-            path = f"inputs{pytree.keystr(keypath)}"
+            path = f"inputs{torch.utils._pytree.keystr(keypath)}"
             if i is not None:
                 path += f".shape[{i}]"
             if (
                 trace_vr.is_singleton()
                 and hint in (0, 1)
-                and not torch.fx.experimental._config.backed_size_oblivious
+                # PATCHED: line removed
+                # and not torch.fx.experimental._config.backed_size_oblivious
             ):
-                msg = (
-                    f"- Received user-specified dim hint Dim.DYNAMIC(min={dim.min}, max={dim.max}), "
-                    f"but export 0/1 specialized due to hint of {hint} for dimension {path}."
-                )
+                return None
+                # PATCHED: line removed
+                # msg = (
+                #     f"- Received user-specified dim hint "
+                #     f"Dim.DYNAMIC(min={dim.min}, max={dim.max}), "
+                #     f"but export 0/1 specialized due to hint of "
+                #     f"{hint} for dimension {path}."
+                # )
             else:
                 msg = (
-                    f"- Received user-specified dim hint Dim.DYNAMIC(min={dim.min}, max={dim.max}), "
-                    f"but tracing inferred a static shape of {out_vr.lower} for dimension {path}."
+                    f"- Received user-specified dim hint "
+                    f"Dim.DYNAMIC(min={dim.min}, max={dim.max}), "
+                    f"but tracing inferred a static shape of "
+                    f"{out_vr.lower} for dimension {path}."
                 )
             return msg

     except torch.utils._sympy.value_ranges.ValueRangeError:
-        path = f"inputs{pytree.keystr(keypath)}"
+        path = f"inputs{torch.utils._pytree.keystr(keypath)}"
         if i is not None:
             path += f".shape[{i}]"
         msg = (
             f"- Received user-specified min/max range of [{dim.min}, {dim.max}], "
-            f"conflicting with the inferred min/max range of [{trace_vr.lower}, {trace_vr.upper}], "
+            f"conflicting with the inferred min/max range of "
+            f"[{trace_vr.lower}, {trace_vr.upper}], "
             f"for {path}."
         )
         return msg

torch: _broadcast_in_dim_meta -> patched__broadcast_in_dim_meta¶

--- original
+++ rewritten
@@ -1,6 +1,9 @@
-def _broadcast_in_dim_meta(
-    a: TensorLikeType, shape: ShapeType, broadcast_dimensions: Sequence[int]
+def patched__broadcast_in_dim_meta(
+    a: torch._prims_common.TensorLikeType,
+    shape: torch._prims_common.ShapeType,
+    broadcast_dimensions: Sequence[int],
 ):
+    """Patches ``torch._prims._broadcast_in_dim_meta``."""
     from torch.fx.experimental.symbolic_shapes import (
         guard_or_false,
         guard_or_true,
@@ -8,7 +11,7 @@
     )

     # Type checks
-    assert isinstance(a, TensorLike)
+    assert isinstance(a, torch._prims_common.TensorLike)
     assert isinstance(shape, Sequence)
     assert isinstance(broadcast_dimensions, Sequence)

@@ -22,7 +25,7 @@
     # (no relative reordering of dims) of integers and
     # each dimension must be within the new shape
     def _greater_than_reduce(acc, x):
-        assert isinstance(x, Dim)
+        assert isinstance(x, (int, torch.export.Dim)), f"unexpected type {type(x)} for x"
         assert x > acc
         assert x < len(shape)

@@ -34,7 +37,9 @@
     for idx, new_idx in enumerate(broadcast_dimensions):
         torch._check(
             sym_or(a.shape[idx] == 1, shape[new_idx] == a.shape[idx]),
-            lambda: f"{a.shape[idx]} must be broadcastable to {shape[new_idx]}",
+            lambda idx=idx, new_idx=new_idx: (
+                f"{a.shape[idx]} must be broadcastable to {shape[new_idx]}"
+            ),
         )

     new_strides = []
@@ -48,10 +53,26 @@
                     new_strides.append(a.stride()[original_idx])
                 else:
                     new_strides.append(0)
+            # PATCHED: disabled this check
+            elif guard_or_false(a.shape[original_idx] != 1):
+                new_strides.append(a.stride()[original_idx])
             else:
+                # This checks generates the following issue:
+                # non-broadcasting semantics require s3 == Max(s10, s3), False,
+                # guard_or_false(a.shape[idx]==1)=False, a.stride()=(1, 2),
+                # idx=1, a.shape=torch.Size([2, s3]), shape=[2, Max(s10, s3)],
+                # original_idx=1
                 torch._check(
                     a.shape[original_idx] == shape[idx],
-                    lambda: f"non-broadcasting semantics require {a.shape[original_idx]} == {shape[idx]}",
+                    lambda idx=idx, original_idx=original_idx: (
+                        f"non-broadcasting semantics require "
+                        f"{a.shape[original_idx]} == {shape[idx]}, "
+                        f"{guard_or_false(a.shape[idx] != 1)}, "
+                        f"guard_or_false(a.shape[idx]==1)="
+                        f"{guard_or_false(a.shape[idx] == 1)}, "
+                        f"a.stride()={a.stride()}, idx={idx}, a.shape={a.shape}, "
+                        f"shape={shape}, original_idx={original_idx}"
+                    ),
                 )
                 new_strides.append(a.stride()[original_idx])
             original_idx = original_idx + 1

torch: _maybe_broadcast -> patched__maybe_broadcast¶

--- original
+++ rewritten
@@ -1,6 +1,9 @@
-def _maybe_broadcast(*args, preserve_cpu_scalar_tensors=True):
+def patched__maybe_broadcast(*args, preserve_cpu_scalar_tensors=True):
+    """Patches ``torch._refs._maybe_broadcast``."""
+    from torch._prims_common import ShapeType, TensorLike, Number
+
     # Computes common shape
-    common_shape = _broadcast_shapes(
+    common_shape = patched__broadcast_shapes(
         *(t.shape if isinstance(t, TensorLike) else None for t in args)
     )

@@ -29,10 +32,15 @@
                 return True

             # u0==u1 assume the same, no broadcasting!
-            torch._check(
-                x == y,
-                lambda: "sizes assumed to be the same due to unbacked broadcasting semantics",
-            )
+            # PATCHED: avoid errors
+            return True  # guard_or_true(x != y)
+            # torch._check(
+            #    x == y,
+            #    lambda x=x, y=y: (
+            #        f"sizes assumed to be the same due to unbacked "
+            #        f"broadcasting semantics x={x!r}, y={y!r}"
+            #    ),
+            # )

         return False

@@ -42,7 +50,7 @@
         elif isinstance(x, Number):
             return x
         elif isinstance(x, TensorLike):
-            if preserve_cpu_scalar_tensors and utils.is_cpu_scalar_tensor(x):
+            if preserve_cpu_scalar_tensors and torch._prims_common.is_cpu_scalar_tensor(x):
                 return x

             if should_expand(x.shape, common_shape):
@@ -50,6 +58,6 @@

             return x
         else:
-            raise RuntimeError("Unexpected type when broadcasting: " + str(type(x)) + "!")
+            raise RuntimeError(f"Unexpected type when broadcasting: {str(type(x))}!")

     return tuple(__maybe_broadcast(x, common_shape) for x in args)

torch: ShapeEnv._evaluate_expr -> patched_ShapeEnv._evaluate_expr¶

--- original
+++ rewritten
@@ -1,14 +1,24 @@
 def _evaluate_expr(
     self,
-    orig_expr: sympy.Basic,
+    orig_expr: "sympy.Basic",  # noqa: F821
     hint: Optional[Union[bool, int, float]] = None,
     fx_node: Optional[torch.fx.Node] = None,
     size_oblivious: bool = False,
     fallback_value: Optional[bool] = None,
     *,
     forcing_spec: bool = False,
-) -> sympy.Basic:
+) -> "sympy.Basic":  # noqa: F821
     # TODO: split conjunctions and evaluate them separately
+    import sympy
+    from torch.fx.experimental import _config as config
+    from torch.fx.experimental.symbolic_shapes import (
+        SympyBoolean,
+        log,
+        SymT,
+        symbol_is_type,
+    )
+    from torch._guards import ShapeGuard
+
     if isinstance(
         orig_expr,
         (sympy.logic.boolalg.BooleanTrue, sympy.logic.boolalg.BooleanFalse),
@@ -118,7 +128,8 @@
                     self._log_suppressed_dde(orig_expr, fallback_value)
                     return fallback_value

-                # oblivious_var_to_val will be defined iff we have sizes with DimDynamic.OBLIVIOUS_SIZE type.
+                # oblivious_var_to_val will be defined iff we have sizes
+                # with DimDynamic.OBLIVIOUS_SIZE type.
                 # See https://github.com/pytorch/pytorch/issues/137100#issuecomment-2495778113
                 if (
                     self.oblivious_var_to_val
@@ -136,15 +147,17 @@
                     log.info(
                         "oblivious_size %s -> %s (passed counterfactual)",
                         orig_expr,
+                        # pyrefly: ignore  # unbound-name
                         correct_hint,
                     )
-
+                    # pyrefly: ignore  # unbound-name
                     concrete_val = correct_hint
                     # NB: do NOT transmute into runtime assert
                     ok = True

                 # unbacked_var_to_val is not None iff propagate_real_tensors is on.
-                # if propagate_real_tensors is on, we check the example values to generate (unsound_result)
+                # if propagate_real_tensors is on, we check the example values
+                # to generate (unsound_result)
                 # and if they pass we add a runtime assertions and continue.
                 if (
                     not ok
@@ -155,25 +168,29 @@
                         )
                     ).free_symbols
                 ):
+                    # pyrefly: ignore  # unbound-name
                     self._log_real_tensor_propagation(orig_expr, unsound_result)
                     transmute_into_runtime_assert = True
-
+                    # pyrefly: ignore  # unbound-name
                     concrete_val = unsound_result
                     ok = True

-                # Check if this is coming from a python assert statement, if so, convert it to a runtime assertion
+                # Check if this is coming from a python assert statement,
+                # if so, convert it to a runtime assertion
                 # instead of failing.
                 if not ok and self.trace_asserts and self._is_python_assert():
                     concrete_val = sympy.true
                     transmute_into_runtime_assert = True
                     ok = True

-                if not ok:
-                    raise self._make_data_dependent_error(
-                        expr.xreplace(self.var_to_val),
-                        expr,
-                        expr_sym_node_id=self._expr_sym_node_id,
-                    )
+                # PATCHED: ok -> True
+                ok = True
+                # if not ok:
+                #    raise self._make_data_dependent_error(
+                #        expr.xreplace(self.var_to_val),
+                #        expr,
+                #        expr_sym_node_id=self._expr_sym_node_id,
+                #    )
             else:
                 expr = new_expr

patch_transformers: dynamic_rope_update -> patched_dynamic_rope_update¶

--- original
+++ rewritten
@@ -1,99 +1,193 @@
-def dynamic_rope_update(rope_forward):
-    """
-    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
-    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).
+def patched_dynamic_rope_update(rope_forward):
+    """manual patch: ``[patch:transformers.modeling_rope_utils.dynamic_rope_update]``

-    Args:
-        rope_forward (Callable):
-            The forward pass of the RoPE implementation.
+    ``rope_type`` is determined in the constructor of class
+    :class:`transformers.models.phi3.modeling_phi3.Phi3RotaryEmbedding`.

-    Returns:
-        The decorated forward pass.
+    .. code-block:: python
+
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            self.rope_type = config.rope_scaling.get(
+                "rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+
+    The original code of the patched function:
+
+    .. code-block:: python
+
+        def dynamic_rope_update(rope_forward):
+            def longrope_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if hasattr(self.config, "original_max_position_embeddings"):
+                    original_max_position_embeddings =
+                        self.config.original_max_position_embeddings
+                else:
+                    original_max_position_embeddings =
+                        self.config.max_position_embeddings
+                if seq_len > original_max_position_embeddings:
+                    if not hasattr(self, "long_inv_freq"):
+                        self.long_inv_freq, _ = self.rope_init_fn(
+                            self.config, device, seq_len=original_max_position_embeddings + 1
+                        )
+                    self.register_buffer("inv_freq", self.long_inv_freq, persistent=False)
+                else:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+
+            def dynamic_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if seq_len > self.max_seq_len_cached:  # growth
+                    inv_freq, self.attention_scaling = self.rope_init_fn(
+                        self.config, device, seq_len=seq_len)
+                    self.register_buffer("inv_freq", inv_freq, persistent=False)
+                    self.max_seq_len_cached = seq_len
+
+                if seq_len < self.original_max_seq_len and
+                        self.max_seq_len_cached > self.original_max_seq_len:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+                    self.max_seq_len_cached = self.original_max_seq_len
+
+            @wraps(rope_forward)
+            def wrapper(self, x, position_ids):
+                if "dynamic" in self.rope_type:
+                    dynamic_frequency_update(self, position_ids, device=x.device)
+                elif self.rope_type == "longrope":
+                    longrope_frequency_update(self, position_ids, device=x.device)
+                return rope_forward(self, x, position_ids)
+
+            return wrapper
+
     """

     def longrope_frequency_update(self, position_ids, device, layer_type=None):
-        """Longrope uses long factor if sequence is larger than original pretraining length, short otherwise."""
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
         seq_len = torch.max(position_ids) + 1
-        original_max_position_embeddings = getattr(
-            self.config, "original_max_position_embeddings", self.config.max_position_embeddings
-        )
+        if hasattr(self.config, "original_max_position_embeddings"):
+            original_max_position_embeddings = self.config.original_max_position_embeddings
+        else:
+            original_max_position_embeddings = self.config.max_position_embeddings
+
         if layer_type is None:
-            rope_type = self.rope_type
+            # rope_type = self.rope_type
             original_inv_freq = self.original_inv_freq
             prefix = ""
         else:
-            rope_type = self.rope_type[layer_type]
+            # rope_type = self.rope_type[layer_type]
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"

-        if seq_len > original_max_position_embeddings:
-            if not hasattr(self, f"{layer_type}_long_inv_freq"):
-                rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
-                long_inv_freq, _ = rope_init_fn(
-                    self.config,
-                    device,
-                    seq_len=original_max_position_embeddings + 1,
-                    layer_type=layer_type,
-                )
-            self.register_buffer(f"{prefix}inv_freq", long_inv_freq, persistent=False)
-            setattr(self, f"{prefix}long_inv_freq", long_inv_freq)
-        else:
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            original_inv_freq = original_inv_freq.to(device)
-            self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False)
-            setattr(self, f"{prefix}original_inv_freq", original_inv_freq)
+        # At export time, seq_len is unknown.
+        long_inv_freq, _ = rope_init_fn(
+            self.config, device, seq_len=original_max_position_embeddings + 1
+        )
+        original_inv_freq = self.original_inv_freq.to(device)
+
+        # PATCHED: uses torch.cond instead of a test
+        cond = (seq_len > original_max_position_embeddings).item()
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        setattr(self, f"{prefix}inv_freq", inv_freq)
+        # if seq_len > original_max_position_embeddings:
+        #    self.inv_freq = self.long_inv_freq
+        # else:
+        #    self.inv_freq = self.original_inv_freq

     def dynamic_frequency_update(self, position_ids, device, layer_type=None):
-        """
-        dynamic RoPE layers should recompute `inv_freq` in the following situations:
-        1 - growing beyond the cached sequence length (allow scaling)
-        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
-        """
+        # constructor:
+        # - self.max_seq_len_cached = config.max_position_embeddings
+        # - self.original_max_seq_len = config.max_position_embeddings
+        # - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
+
+        # This behaviour is difficult to translate.
+        # The sequence always grows.
+        # The test should always True.
+        # So:  self.max_seq_len_cached = max(self.max_seq_len_cached, seq_len) --> seq_len
+        #
+        # if seq_len > self.max_seq_len_cached:  # growth
+        #    inv_freq, self.attention_scaling = self.rope_init_fn(
+        #        self.config, device, seq_len=seq_len
+        #    )
+        #    self.register_buffer("inv_freq", inv_freq, persistent=False)
+        #    self.max_seq_len_cached = seq_len
+        #
+        # So we should not need what follows.
+        #
+        # cond = (seq_len > self.max_seq_len_cached).item()
+        # self.attention_scaling = torch.cond(
+        #    cond,
+        #    (lambda x, y: x.clone()),
+        #    (lambda x, y: y.clone()),
+        #    [attention_scaling, self.attention_scaling],
+        # )
+
         seq_len = torch.max(position_ids) + 1
+        long_inv_freq, self.attention_scaling = rope_init_fn(self.config, device, seq_len=seq_len)
+
         if layer_type is None:
-            rope_type = self.rope_type
-            max_seq_len_cached = self.max_seq_len_cached
+            # rope_type = self.rope_type
+            # max_seq_len_cached = self.max_seq_len_cached
             original_inv_freq = self.original_inv_freq
             prefix = ""
         else:
-            rope_type = self.rope_type[layer_type]
-            max_seq_len_cached = getattr(
-                self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
-            )
+            # rope_type = self.rope_type[layer_type]
+            # max_seq_len_cached = getattr(
+            #     self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
+            # )
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"

-        if seq_len > max_seq_len_cached:  # growth
-            rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
-            inv_freq, self.attention_scaling = rope_init_fn(
-                self.config,
-                device,
-                seq_len=seq_len,
-                layer_type=layer_type,
-            )
-            # TODO joao: may break with compilation
-            self.register_buffer(f"{prefix}inv_freq", inv_freq, persistent=False)
-            setattr(self, f"{layer_type}_max_seq_len_cached", seq_len)
+        # Second test to translate.
+        # Let's keep in mind, self.max_seq_len_cached = seq_len is likely to be True.
+        # But in that case the following condition is a way to restore the original cache.

-        if (
-            seq_len < self.original_max_seq_len and max_seq_len_cached > self.original_max_seq_len
-        ):  # reset
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            original_inv_freq = original_inv_freq.to(device)
-            self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False)
-            setattr(self, f"{prefix}original_inv_freq", original_inv_freq)
-            setattr(self, f"{layer_type}_max_seq_len_cached", self.original_max_seq_len)
+        # if (
+        #    seq_len < self.original_max_seq_len
+        #    and self.max_seq_len_cached > self.original_max_seq_len
+        # ):
+        #    self.original_inv_freq = self.original_inv_freq.to(device)
+        #    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+        #    self.max_seq_len_cached = self.original_max_seq_len
+
+        original_inv_freq = self.original_inv_freq.to(device)
+        cond = (seq_len >= self.original_max_seq_len).item()
+        # PATCHED: uses torch.cond instead of a test
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        setattr(self, f"{prefix}inv_freq", inv_freq)

     @wraps(rope_forward)
     def wrapper(self, x, position_ids, layer_type=None):
-        rope_type = self.rope_type if layer_type is None else self.rope_type[layer_type]
-        kwargs = {"layer_type": layer_type} if layer_type is not None else {}
-        if "dynamic" in rope_type:
-            dynamic_frequency_update(self, position_ids, device=x.device, **kwargs)
-        elif rope_type == "longrope":
-            longrope_frequency_update(self, position_ids, device=x.device, **kwargs)
-        return rope_forward(self, x, position_ids, **kwargs)
+        if layer_type is None:
+            if "dynamic" in self.rope_type:
+                dynamic_frequency_update(self, position_ids, device=x.device)
+            elif self.rope_type == "longrope":
+                longrope_frequency_update(self, position_ids, device=x.device)
+            return rope_forward(self, x, position_ids)
+
+        if "dynamic" in self.rope_type:
+            dynamic_frequency_update(self, position_ids, device=x.device, layer_type=layer_type)
+        elif self.rope_type == "longrope":
+            longrope_frequency_update(self, position_ids, device=x.device, layer_type=layer_type)
+        return rope_forward(self, x, position_ids, layer_type=layer_type)

     return wrapper

transformers: eager_mask -> patched_eager_mask¶

--- original
+++ rewritten
@@ -1,4 +1,4 @@
-def eager_mask(
+def patched_eager_mask(
     batch_size: int,
     cache_position: torch.Tensor,
     kv_length: int,
@@ -6,38 +6,14 @@
     mask_function: Callable = causal_mask_function,
     attention_mask: Optional[torch.Tensor] = None,
     dtype: torch.dtype = torch.float32,
-    use_vmap: bool = False,
     **kwargs,
 ) -> torch.Tensor:
-    """
-    Create a 4D float mask of shape `(batch_size, 1, query_length, kv_length)` where a value of 0 indicates that
-    the element should take part in the attention computation, and -inf (minimum value for the given `dtype`) that
-    it should not.
-
-    Args:
-        batch_size (`int`):
-            The batch size of the input sequence.
-        cache_position (`torch.Tensor`):
-            A tensor of shape (query_length,) indicating the current indices of the input sequence elements.
-        kv_length (`int`):
-            The size that the key and value states will have during the attention computation.
-        kv_offset (`int`, optional):
-            An optional offset to indicate at which first position the key and values states will refer to.
-        mask_function (`Callable`):
-            The mask factory function describing the mask pattern.
-        attention_mask (`torch.Tensor`, optional):
-            The 2D attention mask corresponding to padded tokens of shape (batch_size, number_of_seen_tokens+q_length)
-        dtype (`torch.dtype`, optional):
-            The dtype to use for the mask. By default, `torch.float32`.
-        use_vmap (`bool`, optional):
-            Whether to use `vmap` during the mask construction or not. Allows powerful custom patterns that may not be
-            index-based (for the cost of speed performance). By default `False`.
-    """
+    """manual patch for function ``transformers.masking_utils.eager_mask``."""
     # The masks for eager attention are simply boolean mask from sdpa, casted to 0 and -inf
     _ = kwargs.pop("allow_is_causal_skip", None)
     _ = kwargs.pop("allow_is_bidirectional_skip", None)
-    _ = kwargs.pop("allow_torch_fix", None)
-    mask = sdpa_mask(
+    # PATCHED: this line called the patched version of sdpa_mask
+    mask = patched_sdpa_mask_recent_torch(
         batch_size=batch_size,
         cache_position=cache_position,
         kv_length=kv_length,
@@ -47,10 +23,13 @@
         allow_is_causal_skip=False,
         allow_is_bidirectional_skip=False,
         allow_torch_fix=False,
-        use_vmap=use_vmap,
         **kwargs,
     )
     min_dtype = torch.finfo(dtype).min
-    # we need 0s where the tokens should be taken into account, and -inf otherwise (mask is already of boolean type)
-    mask = torch.where(mask, torch.tensor(0.0, device=mask.device, dtype=dtype), min_dtype)
+    # PATCHED: the following line
+    # we need 0s where the tokens should be taken into account,
+    # and -inf otherwise (mask is already of boolean type)
+    # mask =
+    #   torch.where(mask, torch.tensor(0.0, device=mask.device, dtype=dtype), min_dtype)
+    mask = (~mask).to(dtype) * min_dtype
     return mask

transformers: sdpa_attention_forward -> patched_sdpa_attention_forward¶

--- original
+++ rewritten
@@ -1,4 +1,4 @@
-def sdpa_attention_forward(
+def patched_sdpa_attention_forward(
     module: torch.nn.Module,
     query: torch.Tensor,
     key: torch.Tensor,
@@ -9,57 +9,133 @@
     is_causal: Optional[bool] = None,
     **kwargs,
 ) -> tuple[torch.Tensor, None]:
-    if kwargs.get("output_attentions", False):
-        logger.warning_once(
-            "`sdpa` attention does not support `output_attentions=True`."
-            " Please set your attention to `eager` if you want any of these features."
-        )
+    """
+    manual patch for function
+    ``transformers.integrations.sdpa_attention.sdpa_attention_forward``
+    """
+    assert not kwargs.get("output_attentions", False), (
+        "`sdpa` attention does not support `output_attentions=True`."
+        " Please set your attention to `eager` if you want any of these features."
+    )
+    torch._check(
+        query.shape[0] == key.shape[0] or query.shape[0] == 1,
+        lambda: (
+            f"broadcast issue query (1): {query.shape}, key: {key.shape}, "
+            f"value: {value.shape}"
+        ),
+    )
+    torch._check(
+        key.shape[0] == value.shape[0] or key.shape[0] == 1,
+        lambda: (
+            f"broadcast issue query (2): {query.shape}, key: {key.shape}, "
+            f"value: {value.shape}"
+        ),
+    )
+
     sdpa_kwargs = {}
     if hasattr(module, "num_key_value_groups"):
-        if not use_gqa_in_sdpa(attention_mask, key):
-            key = repeat_kv(key, module.num_key_value_groups)
-            value = repeat_kv(value, module.num_key_value_groups)
+        if not transformers.integrations.sdpa_attention.use_gqa_in_sdpa(attention_mask, key):
+            key = transformers.integrations.sdpa_attention.repeat_kv(
+                key, module.num_key_value_groups
+            )
+            value = transformers.integrations.sdpa_attention.repeat_kv(
+                value, module.num_key_value_groups
+            )
         else:
             sdpa_kwargs = {"enable_gqa": True}

-    # Instead of relying on the value set in the module directly, we use the is_causal passed in kwargs if it is presented
-    is_causal = is_causal if is_causal is not None else getattr(module, "is_causal", True)
+    if attention_mask is not None and attention_mask.ndim == 4:
+        attention_mask = attention_mask[:, :, :, : key.shape[-2]]

-    # SDPA's Flash Attention (and cuDNN) kernels rely on the `is_causal` flag. However, there are certain conditions:
-    # - Not in decoding phase (otherwise we want full attention on the single query token)
-    # - Attention mask is not to be provided (even if it is a causal pattern)
-    # - Internally, we marked this as compatible with causal, i.e. it is a decoder attention type
-    #
-    # Quirks on the conditionals:
-    # - We avoid inline passing this to the SDPA function directly to support both torch.compile's dynamic shapes and
-    #   full graph options. Otherwise, dynamic shapes are prevented from compiling.
-    # - It is important to check first for the shape, otherwise compile will fail with
-    #   `argument 'is_causal' must be bool, not SymBool`.
-    is_causal = query.shape[2] > 1 and attention_mask is None and is_causal
+    torch._check(
+        attention_mask is None or attention_mask.shape[3] == key.shape[2],
+        lambda: "Attention mask shape incompatible with key shape.",
+    )

-    # Shapes (e.g. query.shape[2]) are tensors during jit tracing, resulting in `is_causal` being a tensor.
-    # We convert it to a bool for the SDPA kernel that only accepts bools.
-    if torch.jit.is_tracing() and isinstance(is_causal, torch.Tensor):
-        is_causal = is_causal.item()
+    if patch_sdpa_is_causal:
+        # transformers>=4.55
+        is_causal = is_causal if is_causal is not None else getattr(module, "is_causal", True)

-    # When `is_causal = False` and the `attention_mask` is not of boolean type, the Ascend NPU's SDPA interface cannot utilize the FlashAttentionScore operator，
-    # and falls back to small-operator concatenation. To invoke the FlashAttentionScore, the attention_mask must be converted to boolean type.
-    # This adaptation ensures the `attention_mask` meets the requirement for using FlashAttentionScore.
-    if _is_torch_npu_available:
-        if attention_mask is not None and attention_mask.dtype != torch.bool:
-            # Convert to boolean type, making sdpa to force call FlashAttentionScore to improve performance.
-            attention_mask = torch.logical_not(attention_mask.bool()).to(query.device)
+        # PATCHED: remove the test query.shape[2] > 1
+        # is_causal = query.shape[2] > 1 and attention_mask is None and is_causal
+        # and we split the test to keep the minimum in torch.cond
+        is_causal = attention_mask is None and is_causal

-    attn_output = torch.nn.functional.scaled_dot_product_attention(
-        query,
-        key,
-        value,
-        attn_mask=attention_mask,
-        dropout_p=dropout,
-        scale=scaling,
-        is_causal=is_causal,
-        **sdpa_kwargs,
+        if not is_causal:
+            torch._check(query.shape[0] > 0)
+            torch._check(query.shape[1] > 0)
+            torch._check(query.shape[2] > 0)
+            torch._check(query.shape[3] > 0)
+            torch._check(key.shape[0] > 0)
+            torch._check(key.shape[1] > 0)
+            torch._check(key.shape[2] > 0)
+            torch._check(key.shape[3] > 0)
+            torch._check(value.shape[0] > 0)
+            torch._check(value.shape[1] > 0)
+            torch._check(value.shape[2] > 0)
+            torch._check(value.shape[3] > 0)
+            return (
+                torch.nn.functional.scaled_dot_product_attention(
+                    query,
+                    key,
+                    value,
+                    attn_mask=attention_mask,
+                    dropout_p=dropout,
+                    scale=scaling,
+                    is_causal=is_causal,
+                    **sdpa_kwargs,
+                )
+                .transpose(1, 2)
+                .contiguous(),
+                None,
+            )
+    else:
+        # transformers<4.55
+        if is_causal is None and attention_mask is not None:
+            is_causal = False
+        if is_causal is not None:
+            return (
+                torch.nn.functional.scaled_dot_product_attention(
+                    query,
+                    key,
+                    value,
+                    attn_mask=attention_mask,
+                    dropout_p=dropout,
+                    scale=scaling,
+                    is_causal=is_causal,
+                    **sdpa_kwargs,
+                )
+                .transpose(1, 2)
+                .contiguous(),
+                None,
+            )
+
+    # To avoid the following errors:
+    # is_causal=query.shape[2] > 1
+    # TypeError: scaled_dot_product_attention(): argument 'is_causal' must be bool, not SymBool
+    # is_causal=torch.tensor(query.shape[2] > 1)
+    # TypeError: scaled_dot_product_attention(): argument 'is_causal' must be bool, not Tensor
+    attn_output = torch.cond(
+        query.shape[2] > 1,  # distinction between prefill and decoding steps
+        lambda query, key, value: torch.nn.functional.scaled_dot_product_attention(
+            query,
+            key,
+            value,
+            dropout_p=dropout,
+            scale=scaling,
+            is_causal=True,
+            **sdpa_kwargs,
+        ).contiguous(),
+        lambda query, key, value: torch.nn.functional.scaled_dot_product_attention(
+            query,
+            key,
+            value,
+            dropout_p=dropout,
+            scale=scaling,
+            is_causal=False,
+            **sdpa_kwargs,
+        ).contiguous(),
+        [query, key, value],
     )
     attn_output = attn_output.transpose(1, 2).contiguous()
-
     return attn_output, None

auto/patch_transformers: DynamicLayer.lazy_initialization -> patched_DynamicLayer.lazy_initialization¶

--- original
+++ rewritten
@@ -1,5 +1,9 @@
 def lazy_initialization(self, key_states: torch.Tensor):
     self.dtype, self.device = key_states.dtype, key_states.device
-    self.keys = torch.tensor([], dtype=self.dtype, device=self.device)
-    self.values = torch.tensor([], dtype=self.dtype, device=self.device)
-    self.is_initialized = True
+    new_shape = list(key_states.shape)
+    new_shape[-2] = 0
+    # PATCHED: used a tensor with an empty shape and not en empty list to initialize
+    self.keys = torch.empty(new_shape, dtype=self.dtype, device=self.device)
+    self.values = torch.empty(new_shape, dtype=self.dtype, device=self.device)
+    if patch_is_initialized:
+        self.is_initialized = True

auto/patch_transformers: Gemma2RotaryEmbedding.forward -> common_RotaryEmbedding.forward¶

--- original
+++ rewritten
@@ -1,8 +1,16 @@
-@torch.no_grad()
-@dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
-def forward(self, x, position_ids):
+@patched_dynamic_rope_update
+def forward(self, x, position_ids, layer_type=None):
+    if layer_type is not None:
+        # transformers>=5.0
+        inv_freq = getattr(self, f"{layer_type}_inv_freq")
+        attention_scaling = getattr(self, f"{layer_type}_attention_scaling")
+    else:
+        # transformers<5.0
+        inv_freq = self.inv_freq
+        attention_scaling = self.attention_scaling
+
     inv_freq_expanded = (
-        self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
     )
     position_ids_expanded = position_ids[:, None, :].float()

@@ -12,7 +20,7 @@
     with torch.autocast(device_type=device_type, enabled=False):  # Force float32
         freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
         emb = torch.cat((freqs, freqs), dim=-1)
-        cos = emb.cos() * self.attention_scaling
-        sin = emb.sin() * self.attention_scaling
+        cos = emb.cos() * attention_scaling
+        sin = emb.sin() * attention_scaling

     return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)

--- original
+++ rewritten
@@ -1,99 +1,193 @@
-def dynamic_rope_update(rope_forward):
-    """
-    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
-    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).
+def patched_dynamic_rope_update(rope_forward):
+    """manual patch: ``[patch:transformers.modeling_rope_utils.dynamic_rope_update]``

-    Args:
-        rope_forward (Callable):
-            The forward pass of the RoPE implementation.
+    ``rope_type`` is determined in the constructor of class
+    :class:`transformers.models.phi3.modeling_phi3.Phi3RotaryEmbedding`.

-    Returns:
-        The decorated forward pass.
+    .. code-block:: python
+
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            self.rope_type = config.rope_scaling.get(
+                "rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+
+    The original code of the patched function:
+
+    .. code-block:: python
+
+        def dynamic_rope_update(rope_forward):
+            def longrope_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if hasattr(self.config, "original_max_position_embeddings"):
+                    original_max_position_embeddings =
+                        self.config.original_max_position_embeddings
+                else:
+                    original_max_position_embeddings =
+                        self.config.max_position_embeddings
+                if seq_len > original_max_position_embeddings:
+                    if not hasattr(self, "long_inv_freq"):
+                        self.long_inv_freq, _ = self.rope_init_fn(
+                            self.config, device, seq_len=original_max_position_embeddings + 1
+                        )
+                    self.register_buffer("inv_freq", self.long_inv_freq, persistent=False)
+                else:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+
+            def dynamic_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if seq_len > self.max_seq_len_cached:  # growth
+                    inv_freq, self.attention_scaling = self.rope_init_fn(
+                        self.config, device, seq_len=seq_len)
+                    self.register_buffer("inv_freq", inv_freq, persistent=False)
+                    self.max_seq_len_cached = seq_len
+
+                if seq_len < self.original_max_seq_len and
+                        self.max_seq_len_cached > self.original_max_seq_len:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+                    self.max_seq_len_cached = self.original_max_seq_len
+
+            @wraps(rope_forward)
+            def wrapper(self, x, position_ids):
+                if "dynamic" in self.rope_type:
+                    dynamic_frequency_update(self, position_ids, device=x.device)
+                elif self.rope_type == "longrope":
+                    longrope_frequency_update(self, position_ids, device=x.device)
+                return rope_forward(self, x, position_ids)
+
+            return wrapper
+
     """

     def longrope_frequency_update(self, position_ids, device, layer_type=None):
-        """Longrope uses long factor if sequence is larger than original pretraining length, short otherwise."""
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
         seq_len = torch.max(position_ids) + 1
-        original_max_position_embeddings = getattr(
-            self.config, "original_max_position_embeddings", self.config.max_position_embeddings
-        )
+        if hasattr(self.config, "original_max_position_embeddings"):
+            original_max_position_embeddings = self.config.original_max_position_embeddings
+        else:
+            original_max_position_embeddings = self.config.max_position_embeddings
+
         if layer_type is None:
-            rope_type = self.rope_type
+            # rope_type = self.rope_type
             original_inv_freq = self.original_inv_freq
             prefix = ""
         else:
-            rope_type = self.rope_type[layer_type]
+            # rope_type = self.rope_type[layer_type]
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"

-        if seq_len > original_max_position_embeddings:
-            if not hasattr(self, f"{layer_type}_long_inv_freq"):
-                rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
-                long_inv_freq, _ = rope_init_fn(
-                    self.config,
-                    device,
-                    seq_len=original_max_position_embeddings + 1,
-                    layer_type=layer_type,
-                )
-            self.register_buffer(f"{prefix}inv_freq", long_inv_freq, persistent=False)
-            setattr(self, f"{prefix}long_inv_freq", long_inv_freq)
-        else:
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            original_inv_freq = original_inv_freq.to(device)
-            self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False)
-            setattr(self, f"{prefix}original_inv_freq", original_inv_freq)
+        # At export time, seq_len is unknown.
+        long_inv_freq, _ = rope_init_fn(
+            self.config, device, seq_len=original_max_position_embeddings + 1
+        )
+        original_inv_freq = self.original_inv_freq.to(device)
+
+        # PATCHED: uses torch.cond instead of a test
+        cond = (seq_len > original_max_position_embeddings).item()
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        setattr(self, f"{prefix}inv_freq", inv_freq)
+        # if seq_len > original_max_position_embeddings:
+        #    self.inv_freq = self.long_inv_freq
+        # else:
+        #    self.inv_freq = self.original_inv_freq

     def dynamic_frequency_update(self, position_ids, device, layer_type=None):
-        """
-        dynamic RoPE layers should recompute `inv_freq` in the following situations:
-        1 - growing beyond the cached sequence length (allow scaling)
-        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
-        """
+        # constructor:
+        # - self.max_seq_len_cached = config.max_position_embeddings
+        # - self.original_max_seq_len = config.max_position_embeddings
+        # - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
+
+        # This behaviour is difficult to translate.
+        # The sequence always grows.
+        # The test should always True.
+        # So:  self.max_seq_len_cached = max(self.max_seq_len_cached, seq_len) --> seq_len
+        #
+        # if seq_len > self.max_seq_len_cached:  # growth
+        #    inv_freq, self.attention_scaling = self.rope_init_fn(
+        #        self.config, device, seq_len=seq_len
+        #    )
+        #    self.register_buffer("inv_freq", inv_freq, persistent=False)
+        #    self.max_seq_len_cached = seq_len
+        #
+        # So we should not need what follows.
+        #
+        # cond = (seq_len > self.max_seq_len_cached).item()
+        # self.attention_scaling = torch.cond(
+        #    cond,
+        #    (lambda x, y: x.clone()),
+        #    (lambda x, y: y.clone()),
+        #    [attention_scaling, self.attention_scaling],
+        # )
+
         seq_len = torch.max(position_ids) + 1
+        long_inv_freq, self.attention_scaling = rope_init_fn(self.config, device, seq_len=seq_len)
+
         if layer_type is None:
-            rope_type = self.rope_type
-            max_seq_len_cached = self.max_seq_len_cached
+            # rope_type = self.rope_type
+            # max_seq_len_cached = self.max_seq_len_cached
             original_inv_freq = self.original_inv_freq
             prefix = ""
         else:
-            rope_type = self.rope_type[layer_type]
-            max_seq_len_cached = getattr(
-                self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
-            )
+            # rope_type = self.rope_type[layer_type]
+            # max_seq_len_cached = getattr(
+            #     self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
+            # )
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"

-        if seq_len > max_seq_len_cached:  # growth
-            rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
-            inv_freq, self.attention_scaling = rope_init_fn(
-                self.config,
-                device,
-                seq_len=seq_len,
-                layer_type=layer_type,
-            )
-            # TODO joao: may break with compilation
-            self.register_buffer(f"{prefix}inv_freq", inv_freq, persistent=False)
-            setattr(self, f"{layer_type}_max_seq_len_cached", seq_len)
+        # Second test to translate.
+        # Let's keep in mind, self.max_seq_len_cached = seq_len is likely to be True.
+        # But in that case the following condition is a way to restore the original cache.

-        if (
-            seq_len < self.original_max_seq_len and max_seq_len_cached > self.original_max_seq_len
-        ):  # reset
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            original_inv_freq = original_inv_freq.to(device)
-            self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False)
-            setattr(self, f"{prefix}original_inv_freq", original_inv_freq)
-            setattr(self, f"{layer_type}_max_seq_len_cached", self.original_max_seq_len)
+        # if (
+        #    seq_len < self.original_max_seq_len
+        #    and self.max_seq_len_cached > self.original_max_seq_len
+        # ):
+        #    self.original_inv_freq = self.original_inv_freq.to(device)
+        #    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+        #    self.max_seq_len_cached = self.original_max_seq_len
+
+        original_inv_freq = self.original_inv_freq.to(device)
+        cond = (seq_len >= self.original_max_seq_len).item()
+        # PATCHED: uses torch.cond instead of a test
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        setattr(self, f"{prefix}inv_freq", inv_freq)

     @wraps(rope_forward)
     def wrapper(self, x, position_ids, layer_type=None):
-        rope_type = self.rope_type if layer_type is None else self.rope_type[layer_type]
-        kwargs = {"layer_type": layer_type} if layer_type is not None else {}
-        if "dynamic" in rope_type:
-            dynamic_frequency_update(self, position_ids, device=x.device, **kwargs)
-        elif rope_type == "longrope":
-            longrope_frequency_update(self, position_ids, device=x.device, **kwargs)
-        return rope_forward(self, x, position_ids, **kwargs)
+        if layer_type is None:
+            if "dynamic" in self.rope_type:
+                dynamic_frequency_update(self, position_ids, device=x.device)
+            elif self.rope_type == "longrope":
+                longrope_frequency_update(self, position_ids, device=x.device)
+            return rope_forward(self, x, position_ids)
+
+        if "dynamic" in self.rope_type:
+            dynamic_frequency_update(self, position_ids, device=x.device, layer_type=layer_type)
+        elif self.rope_type == "longrope":
+            longrope_frequency_update(self, position_ids, device=x.device, layer_type=layer_type)
+        return rope_forward(self, x, position_ids, layer_type=layer_type)

     return wrapper

auto/patch_transformers: Gemma3Model.get_placeholder_mask -> patched_Gemma3Model.get_placeholder_mask¶

--- original
+++ rewritten
@@ -4,14 +4,12 @@
     inputs_embeds: torch.FloatTensor,
     image_features: torch.FloatTensor,
 ):
-    """
-    Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
-    equal to the length of multimodal features. If the lengths are different, an error is raised.
-    """
     if input_ids is None:
         special_image_mask = inputs_embeds == self.get_input_embeddings()(
             torch.tensor(
-                self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device
+                self.config.image_token_id,
+                dtype=torch.long,
+                device=inputs_embeds.device,
             )
         )
         special_image_mask = special_image_mask.all(-1)
@@ -23,8 +21,14 @@
         special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
     )
     n_image_features = image_features.shape[0] * image_features.shape[1]
-    if inputs_embeds[special_image_mask].numel() != image_features.numel():
-        raise ValueError(
-            f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
-        )
+    # PATCHED: torch._check
+    # if inputs_embeds[special_image_mask].numel() != image_features.numel():
+    #    raise ValueError( ... )
+    torch._check(
+        inputs_embeds[special_image_mask].numel() == image_features.numel(),
+        lambda: (
+            f"Image features and image tokens do not match: tokens: "
+            f"{n_image_tokens}, features {n_image_features}"
+        ),
+    )
     return special_image_mask

auto/patch_transformers: Gemma3RotaryEmbedding.forward -> common_RotaryEmbedding.forward¶

--- original
+++ rewritten
@@ -1,8 +1,13 @@
-@torch.no_grad()
-@dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+@patched_dynamic_rope_update
 def forward(self, x, position_ids, layer_type=None):
-    inv_freq = getattr(self, f"{layer_type}_inv_freq")
-    attention_scaling = getattr(self, f"{layer_type}_attention_scaling")
+    if layer_type is not None:
+        # transformers>=5.0
+        inv_freq = getattr(self, f"{layer_type}_inv_freq")
+        attention_scaling = getattr(self, f"{layer_type}_attention_scaling")
+    else:
+        # transformers<5.0
+        inv_freq = self.inv_freq
+        attention_scaling = self.attention_scaling

     inv_freq_expanded = (
         inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)

--- original
+++ rewritten
@@ -1,99 +1,193 @@
-def dynamic_rope_update(rope_forward):
-    """
-    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
-    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).
+def patched_dynamic_rope_update(rope_forward):
+    """manual patch: ``[patch:transformers.modeling_rope_utils.dynamic_rope_update]``

-    Args:
-        rope_forward (Callable):
-            The forward pass of the RoPE implementation.
+    ``rope_type`` is determined in the constructor of class
+    :class:`transformers.models.phi3.modeling_phi3.Phi3RotaryEmbedding`.

-    Returns:
-        The decorated forward pass.
+    .. code-block:: python
+
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            self.rope_type = config.rope_scaling.get(
+                "rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+
+    The original code of the patched function:
+
+    .. code-block:: python
+
+        def dynamic_rope_update(rope_forward):
+            def longrope_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if hasattr(self.config, "original_max_position_embeddings"):
+                    original_max_position_embeddings =
+                        self.config.original_max_position_embeddings
+                else:
+                    original_max_position_embeddings =
+                        self.config.max_position_embeddings
+                if seq_len > original_max_position_embeddings:
+                    if not hasattr(self, "long_inv_freq"):
+                        self.long_inv_freq, _ = self.rope_init_fn(
+                            self.config, device, seq_len=original_max_position_embeddings + 1
+                        )
+                    self.register_buffer("inv_freq", self.long_inv_freq, persistent=False)
+                else:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+
+            def dynamic_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if seq_len > self.max_seq_len_cached:  # growth
+                    inv_freq, self.attention_scaling = self.rope_init_fn(
+                        self.config, device, seq_len=seq_len)
+                    self.register_buffer("inv_freq", inv_freq, persistent=False)
+                    self.max_seq_len_cached = seq_len
+
+                if seq_len < self.original_max_seq_len and
+                        self.max_seq_len_cached > self.original_max_seq_len:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+                    self.max_seq_len_cached = self.original_max_seq_len
+
+            @wraps(rope_forward)
+            def wrapper(self, x, position_ids):
+                if "dynamic" in self.rope_type:
+                    dynamic_frequency_update(self, position_ids, device=x.device)
+                elif self.rope_type == "longrope":
+                    longrope_frequency_update(self, position_ids, device=x.device)
+                return rope_forward(self, x, position_ids)
+
+            return wrapper
+
     """

     def longrope_frequency_update(self, position_ids, device, layer_type=None):
-        """Longrope uses long factor if sequence is larger than original pretraining length, short otherwise."""
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
         seq_len = torch.max(position_ids) + 1
-        original_max_position_embeddings = getattr(
-            self.config, "original_max_position_embeddings", self.config.max_position_embeddings
-        )
+        if hasattr(self.config, "original_max_position_embeddings"):
+            original_max_position_embeddings = self.config.original_max_position_embeddings
+        else:
+            original_max_position_embeddings = self.config.max_position_embeddings
+
         if layer_type is None:
-            rope_type = self.rope_type
+            # rope_type = self.rope_type
             original_inv_freq = self.original_inv_freq
             prefix = ""
         else:
-            rope_type = self.rope_type[layer_type]
+            # rope_type = self.rope_type[layer_type]
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"

-        if seq_len > original_max_position_embeddings:
-            if not hasattr(self, f"{layer_type}_long_inv_freq"):
-                rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
-                long_inv_freq, _ = rope_init_fn(
-                    self.config,
-                    device,
-                    seq_len=original_max_position_embeddings + 1,
-                    layer_type=layer_type,
-                )
-            self.register_buffer(f"{prefix}inv_freq", long_inv_freq, persistent=False)
-            setattr(self, f"{prefix}long_inv_freq", long_inv_freq)
-        else:
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            original_inv_freq = original_inv_freq.to(device)
-            self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False)
-            setattr(self, f"{prefix}original_inv_freq", original_inv_freq)
+        # At export time, seq_len is unknown.
+        long_inv_freq, _ = rope_init_fn(
+            self.config, device, seq_len=original_max_position_embeddings + 1
+        )
+        original_inv_freq = self.original_inv_freq.to(device)
+
+        # PATCHED: uses torch.cond instead of a test
+        cond = (seq_len > original_max_position_embeddings).item()
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        setattr(self, f"{prefix}inv_freq", inv_freq)
+        # if seq_len > original_max_position_embeddings:
+        #    self.inv_freq = self.long_inv_freq
+        # else:
+        #    self.inv_freq = self.original_inv_freq

     def dynamic_frequency_update(self, position_ids, device, layer_type=None):
-        """
-        dynamic RoPE layers should recompute `inv_freq` in the following situations:
-        1 - growing beyond the cached sequence length (allow scaling)
-        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
-        """
+        # constructor:
+        # - self.max_seq_len_cached = config.max_position_embeddings
+        # - self.original_max_seq_len = config.max_position_embeddings
+        # - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
+
+        # This behaviour is difficult to translate.
+        # The sequence always grows.
+        # The test should always True.
+        # So:  self.max_seq_len_cached = max(self.max_seq_len_cached, seq_len) --> seq_len
+        #
+        # if seq_len > self.max_seq_len_cached:  # growth
+        #    inv_freq, self.attention_scaling = self.rope_init_fn(
+        #        self.config, device, seq_len=seq_len
+        #    )
+        #    self.register_buffer("inv_freq", inv_freq, persistent=False)
+        #    self.max_seq_len_cached = seq_len
+        #
+        # So we should not need what follows.
+        #
+        # cond = (seq_len > self.max_seq_len_cached).item()
+        # self.attention_scaling = torch.cond(
+        #    cond,
+        #    (lambda x, y: x.clone()),
+        #    (lambda x, y: y.clone()),
+        #    [attention_scaling, self.attention_scaling],
+        # )
+
         seq_len = torch.max(position_ids) + 1
+        long_inv_freq, self.attention_scaling = rope_init_fn(self.config, device, seq_len=seq_len)
+
         if layer_type is None:
-            rope_type = self.rope_type
-            max_seq_len_cached = self.max_seq_len_cached
+            # rope_type = self.rope_type
+            # max_seq_len_cached = self.max_seq_len_cached
             original_inv_freq = self.original_inv_freq
             prefix = ""
         else:
-            rope_type = self.rope_type[layer_type]
-            max_seq_len_cached = getattr(
-                self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
-            )
+            # rope_type = self.rope_type[layer_type]
+            # max_seq_len_cached = getattr(
+            #     self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
+            # )
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"

-        if seq_len > max_seq_len_cached:  # growth
-            rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
-            inv_freq, self.attention_scaling = rope_init_fn(
-                self.config,
-                device,
-                seq_len=seq_len,
-                layer_type=layer_type,
-            )
-            # TODO joao: may break with compilation
-            self.register_buffer(f"{prefix}inv_freq", inv_freq, persistent=False)
-            setattr(self, f"{layer_type}_max_seq_len_cached", seq_len)
+        # Second test to translate.
+        # Let's keep in mind, self.max_seq_len_cached = seq_len is likely to be True.
+        # But in that case the following condition is a way to restore the original cache.

-        if (
-            seq_len < self.original_max_seq_len and max_seq_len_cached > self.original_max_seq_len
-        ):  # reset
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            original_inv_freq = original_inv_freq.to(device)
-            self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False)
-            setattr(self, f"{prefix}original_inv_freq", original_inv_freq)
-            setattr(self, f"{layer_type}_max_seq_len_cached", self.original_max_seq_len)
+        # if (
+        #    seq_len < self.original_max_seq_len
+        #    and self.max_seq_len_cached > self.original_max_seq_len
+        # ):
+        #    self.original_inv_freq = self.original_inv_freq.to(device)
+        #    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+        #    self.max_seq_len_cached = self.original_max_seq_len
+
+        original_inv_freq = self.original_inv_freq.to(device)
+        cond = (seq_len >= self.original_max_seq_len).item()
+        # PATCHED: uses torch.cond instead of a test
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        setattr(self, f"{prefix}inv_freq", inv_freq)

     @wraps(rope_forward)
     def wrapper(self, x, position_ids, layer_type=None):
-        rope_type = self.rope_type if layer_type is None else self.rope_type[layer_type]
-        kwargs = {"layer_type": layer_type} if layer_type is not None else {}
-        if "dynamic" in rope_type:
-            dynamic_frequency_update(self, position_ids, device=x.device, **kwargs)
-        elif rope_type == "longrope":
-            longrope_frequency_update(self, position_ids, device=x.device, **kwargs)
-        return rope_forward(self, x, position_ids, **kwargs)
+        if layer_type is None:
+            if "dynamic" in self.rope_type:
+                dynamic_frequency_update(self, position_ids, device=x.device)
+            elif self.rope_type == "longrope":
+                longrope_frequency_update(self, position_ids, device=x.device)
+            return rope_forward(self, x, position_ids)
+
+        if "dynamic" in self.rope_type:
+            dynamic_frequency_update(self, position_ids, device=x.device, layer_type=layer_type)
+        elif self.rope_type == "longrope":
+            longrope_frequency_update(self, position_ids, device=x.device, layer_type=layer_type)
+        return rope_forward(self, x, position_ids, layer_type=layer_type)

     return wrapper

auto/patch_transformers: GemmaRotaryEmbedding.forward -> common_RotaryEmbedding.forward¶

--- original
+++ rewritten
@@ -1,8 +1,16 @@
-@torch.no_grad()
-@dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
-def forward(self, x, position_ids):
+@patched_dynamic_rope_update
+def forward(self, x, position_ids, layer_type=None):
+    if layer_type is not None:
+        # transformers>=5.0
+        inv_freq = getattr(self, f"{layer_type}_inv_freq")
+        attention_scaling = getattr(self, f"{layer_type}_attention_scaling")
+    else:
+        # transformers<5.0
+        inv_freq = self.inv_freq
+        attention_scaling = self.attention_scaling
+
     inv_freq_expanded = (
-        self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
     )
     position_ids_expanded = position_ids[:, None, :].float()

@@ -12,7 +20,7 @@
     with torch.autocast(device_type=device_type, enabled=False):  # Force float32
         freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
         emb = torch.cat((freqs, freqs), dim=-1)
-        cos = emb.cos() * self.attention_scaling
-        sin = emb.sin() * self.attention_scaling
+        cos = emb.cos() * attention_scaling
+        sin = emb.sin() * attention_scaling

     return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)

--- original
+++ rewritten
@@ -1,99 +1,193 @@
-def dynamic_rope_update(rope_forward):
-    """
-    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
-    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).
+def patched_dynamic_rope_update(rope_forward):
+    """manual patch: ``[patch:transformers.modeling_rope_utils.dynamic_rope_update]``

-    Args:
-        rope_forward (Callable):
-            The forward pass of the RoPE implementation.
+    ``rope_type`` is determined in the constructor of class
+    :class:`transformers.models.phi3.modeling_phi3.Phi3RotaryEmbedding`.

-    Returns:
-        The decorated forward pass.
+    .. code-block:: python
+
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            self.rope_type = config.rope_scaling.get(
+                "rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+
+    The original code of the patched function:
+
+    .. code-block:: python
+
+        def dynamic_rope_update(rope_forward):
+            def longrope_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if hasattr(self.config, "original_max_position_embeddings"):
+                    original_max_position_embeddings =
+                        self.config.original_max_position_embeddings
+                else:
+                    original_max_position_embeddings =
+                        self.config.max_position_embeddings
+                if seq_len > original_max_position_embeddings:
+                    if not hasattr(self, "long_inv_freq"):
+                        self.long_inv_freq, _ = self.rope_init_fn(
+                            self.config, device, seq_len=original_max_position_embeddings + 1
+                        )
+                    self.register_buffer("inv_freq", self.long_inv_freq, persistent=False)
+                else:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+
+            def dynamic_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if seq_len > self.max_seq_len_cached:  # growth
+                    inv_freq, self.attention_scaling = self.rope_init_fn(
+                        self.config, device, seq_len=seq_len)
+                    self.register_buffer("inv_freq", inv_freq, persistent=False)
+                    self.max_seq_len_cached = seq_len
+
+                if seq_len < self.original_max_seq_len and
+                        self.max_seq_len_cached > self.original_max_seq_len:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+                    self.max_seq_len_cached = self.original_max_seq_len
+
+            @wraps(rope_forward)
+            def wrapper(self, x, position_ids):
+                if "dynamic" in self.rope_type:
+                    dynamic_frequency_update(self, position_ids, device=x.device)
+                elif self.rope_type == "longrope":
+                    longrope_frequency_update(self, position_ids, device=x.device)
+                return rope_forward(self, x, position_ids)
+
+            return wrapper
+
     """

     def longrope_frequency_update(self, position_ids, device, layer_type=None):
-        """Longrope uses long factor if sequence is larger than original pretraining length, short otherwise."""
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
         seq_len = torch.max(position_ids) + 1
-        original_max_position_embeddings = getattr(
-            self.config, "original_max_position_embeddings", self.config.max_position_embeddings
-        )
+        if hasattr(self.config, "original_max_position_embeddings"):
+            original_max_position_embeddings = self.config.original_max_position_embeddings
+        else:
+            original_max_position_embeddings = self.config.max_position_embeddings
+
         if layer_type is None:
-            rope_type = self.rope_type
+            # rope_type = self.rope_type
             original_inv_freq = self.original_inv_freq
             prefix = ""
         else:
-            rope_type = self.rope_type[layer_type]
+            # rope_type = self.rope_type[layer_type]
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"

-        if seq_len > original_max_position_embeddings:
-            if not hasattr(self, f"{layer_type}_long_inv_freq"):
-                rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
-                long_inv_freq, _ = rope_init_fn(
-                    self.config,
-                    device,
-                    seq_len=original_max_position_embeddings + 1,
-                    layer_type=layer_type,
-                )
-            self.register_buffer(f"{prefix}inv_freq", long_inv_freq, persistent=False)
-            setattr(self, f"{prefix}long_inv_freq", long_inv_freq)
-        else:
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            original_inv_freq = original_inv_freq.to(device)
-            self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False)
-            setattr(self, f"{prefix}original_inv_freq", original_inv_freq)
+        # At export time, seq_len is unknown.
+        long_inv_freq, _ = rope_init_fn(
+            self.config, device, seq_len=original_max_position_embeddings + 1
+        )
+        original_inv_freq = self.original_inv_freq.to(device)
+
+        # PATCHED: uses torch.cond instead of a test
+        cond = (seq_len > original_max_position_embeddings).item()
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        setattr(self, f"{prefix}inv_freq", inv_freq)
+        # if seq_len > original_max_position_embeddings:
+        #    self.inv_freq = self.long_inv_freq
+        # else:
+        #    self.inv_freq = self.original_inv_freq

     def dynamic_frequency_update(self, position_ids, device, layer_type=None):
-        """
-        dynamic RoPE layers should recompute `inv_freq` in the following situations:
-        1 - growing beyond the cached sequence length (allow scaling)
-        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
-        """
+        # constructor:
+        # - self.max_seq_len_cached = config.max_position_embeddings
+        # - self.original_max_seq_len = config.max_position_embeddings
+        # - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
+
+        # This behaviour is difficult to translate.
+        # The sequence always grows.
+        # The test should always True.
+        # So:  self.max_seq_len_cached = max(self.max_seq_len_cached, seq_len) --> seq_len
+        #
+        # if seq_len > self.max_seq_len_cached:  # growth
+        #    inv_freq, self.attention_scaling = self.rope_init_fn(
+        #        self.config, device, seq_len=seq_len
+        #    )
+        #    self.register_buffer("inv_freq", inv_freq, persistent=False)
+        #    self.max_seq_len_cached = seq_len
+        #
+        # So we should not need what follows.
+        #
+        # cond = (seq_len > self.max_seq_len_cached).item()
+        # self.attention_scaling = torch.cond(
+        #    cond,
+        #    (lambda x, y: x.clone()),
+        #    (lambda x, y: y.clone()),
+        #    [attention_scaling, self.attention_scaling],
+        # )
+
         seq_len = torch.max(position_ids) + 1
+        long_inv_freq, self.attention_scaling = rope_init_fn(self.config, device, seq_len=seq_len)
+
         if layer_type is None:
-            rope_type = self.rope_type
-            max_seq_len_cached = self.max_seq_len_cached
+            # rope_type = self.rope_type
+            # max_seq_len_cached = self.max_seq_len_cached
             original_inv_freq = self.original_inv_freq
             prefix = ""
         else:
-            rope_type = self.rope_type[layer_type]
-            max_seq_len_cached = getattr(
-                self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
-            )
+            # rope_type = self.rope_type[layer_type]
+            # max_seq_len_cached = getattr(
+            #     self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
+            # )
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"

-        if seq_len > max_seq_len_cached:  # growth
-            rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
-            inv_freq, self.attention_scaling = rope_init_fn(
-                self.config,
-                device,
-                seq_len=seq_len,
-                layer_type=layer_type,
-            )
-            # TODO joao: may break with compilation
-            self.register_buffer(f"{prefix}inv_freq", inv_freq, persistent=False)
-            setattr(self, f"{layer_type}_max_seq_len_cached", seq_len)
+        # Second test to translate.
+        # Let's keep in mind, self.max_seq_len_cached = seq_len is likely to be True.
+        # But in that case the following condition is a way to restore the original cache.

-        if (
-            seq_len < self.original_max_seq_len and max_seq_len_cached > self.original_max_seq_len
-        ):  # reset
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            original_inv_freq = original_inv_freq.to(device)
-            self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False)
-            setattr(self, f"{prefix}original_inv_freq", original_inv_freq)
-            setattr(self, f"{layer_type}_max_seq_len_cached", self.original_max_seq_len)
+        # if (
+        #    seq_len < self.original_max_seq_len
+        #    and self.max_seq_len_cached > self.original_max_seq_len
+        # ):
+        #    self.original_inv_freq = self.original_inv_freq.to(device)
+        #    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+        #    self.max_seq_len_cached = self.original_max_seq_len
+
+        original_inv_freq = self.original_inv_freq.to(device)
+        cond = (seq_len >= self.original_max_seq_len).item()
+        # PATCHED: uses torch.cond instead of a test
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        setattr(self, f"{prefix}inv_freq", inv_freq)

     @wraps(rope_forward)
     def wrapper(self, x, position_ids, layer_type=None):
-        rope_type = self.rope_type if layer_type is None else self.rope_type[layer_type]
-        kwargs = {"layer_type": layer_type} if layer_type is not None else {}
-        if "dynamic" in rope_type:
-            dynamic_frequency_update(self, position_ids, device=x.device, **kwargs)
-        elif rope_type == "longrope":
-            longrope_frequency_update(self, position_ids, device=x.device, **kwargs)
-        return rope_forward(self, x, position_ids, **kwargs)
+        if layer_type is None:
+            if "dynamic" in self.rope_type:
+                dynamic_frequency_update(self, position_ids, device=x.device)
+            elif self.rope_type == "longrope":
+                longrope_frequency_update(self, position_ids, device=x.device)
+            return rope_forward(self, x, position_ids)
+
+        if "dynamic" in self.rope_type:
+            dynamic_frequency_update(self, position_ids, device=x.device, layer_type=layer_type)
+        elif self.rope_type == "longrope":
+            longrope_frequency_update(self, position_ids, device=x.device, layer_type=layer_type)
+        return rope_forward(self, x, position_ids, layer_type=layer_type)

     return wrapper

auto/patch_transformers: GenerationMixin._cache_dependant_input_preparation -> patched_GenerationMixin._cache_dependant_input_preparation¶

--- original
+++ rewritten
@@ -3,23 +3,29 @@
     input_ids: torch.LongTensor,
     inputs_embeds: Optional[torch.FloatTensor],
     cache_position: Optional[torch.LongTensor],
-) -> tuple[torch.FloatTensor, torch.LongTensor]:
+) -> Tuple[torch.FloatTensor, torch.LongTensor]:
     """
     Generic cache-dependent input preparation
     The code is put in a separate function to allow granular unit testing
     as it needs a different implementation to be exportable.

-    If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
-    - Exception 1: when passing input_embeds, input_ids may be missing entries
-    - Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
-    - Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
-    - Exception 4: If input_embeds are passed then slice it through `cache_position`, to keep only the unprocessed tokens and
-      generate the first token for each sequence. Later use the generated Input ids for continuation.
+    If we have cache: let's slice `input_ids` through `cache_position`,
+    to keep only the unprocessed tokens
+    - Exception 1: when passing input_embeds,
+      input_ids may be missing entries
+    - Exception 2: some generation methods do special slicing of input_ids,
+      so we don't need to do it here
+    - Exception 3: with synced GPUs cache_position may go out of bounds,
+      but we only want dummy token in that case.
+    - Exception 4: If input_embeds are passed then slice it through
+      `cache_position`, to keep only the unprocessed tokens and
+      generate the first token for each sequence.
+      Later use the generated Input ids for continuation.

     The current implementation does not rely on ``self`` and could be
     a class method. It is left as a standard method to be easily rewritten.
     """
-    if is_torchdynamo_exporting():
+    if _is_torchdynamo_exporting():
         return self._cache_dependant_input_preparation_exporting(
             input_ids, inputs_embeds, cache_position
         )

auto/patch_transformers: GenerationMixin._cache_dependant_input_preparation_exporting -> patched_GenerationMixin._cache_dependant_input_preparation_exporting¶

--- original
+++ rewritten
@@ -3,7 +3,7 @@
     input_ids: torch.LongTensor,
     inputs_embeds: Optional[torch.FloatTensor],
     cache_position: Optional[torch.LongTensor],
-) -> tuple[torch.FloatTensor, torch.LongTensor]:
+) -> Tuple[torch.FloatTensor, torch.LongTensor]:
     """
     This method implements method ``_cache_dependant_input_preparation``
     with :func:`torch.cond` to make it exportable with :func:`torch.export.export`.
@@ -21,7 +21,6 @@
         #     else:
         #         if input_ids.shape[1] != cache_position.shape[0]:
         #             input_ids = input_ids[:, cache_position]
-        # We need to clone the outputs to avoid aliasing.
         def branch_1(inputs_embeds, cache_position):
             return inputs_embeds[:, -cache_position.shape[0] :].clone()

@@ -49,7 +48,7 @@
                             torch.cond(
                                 input_ids.shape[1] != cache_position.shape[0],
                                 branch_3,
-                                (lambda input_ids, cache_position: input_ids.clone()),
+                                (lambda input_ids, cache_position: input_ids),
                                 [input_ids, cache_position],
                             )
                         ),

auto/patch_transformers: IdeficsAttention.forward -> patched_IdeficsAttention.forward¶

--- original
+++ rewritten
@@ -4,10 +4,12 @@
     key_value_states: Optional[torch.Tensor] = None,
     attention_mask: Optional[torch.Tensor] = None,
     position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[Cache] = None,
+    past_key_value: Optional[Tuple[torch.Tensor]] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
     cache_position: Optional[torch.LongTensor] = None,
-    **kwargs: Unpack[TransformersKwargs],
-) -> tuple[torch.Tensor, torch.Tensor]:
+    **kwargs,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
     # if key_value_states are provided this layer is used as a cross-attention layer
     is_cross_attention = self.is_cross_attention or key_value_states is not None

@@ -43,20 +45,27 @@
         )

     kv_seq_len = key_states.shape[-2]
-    if past_key_values is not None:
+    if past_key_value is not None:
         kv_seq_len += cache_position[0]

     if not is_cross_attention:
-        cos, sin = self.rotary_emb(value_states, seq_len=max(kv_seq_len, q_len))
-        query_states, key_states = apply_rotary_pos_emb(
-            query_states, key_states, cos, sin, position_ids
+        rotary_length = torch.maximum(
+            torch.tensor(kv_seq_len, dtype=torch.int64),
+            torch.tensor(q_len, dtype=torch.int64),
+        )
+        cos, sin = self.rotary_emb(value_states, seq_len=rotary_length)
+        query_states, key_states = (
+            transformers.models.idefics.modeling_idefics.apply_rotary_pos_emb(
+                query_states, key_states, cos, sin, position_ids
+            )
         )
     # [bsz, nh, t, hd]

-    if past_key_values is not None:
-        # sin and cos are specific to RoPE models; cache_position needed for the static cache
+    if past_key_value is not None:
+        # sin and cos are specific to RoPE models;
+        # cache_position needed for the static cache
         cache_kwargs = {"cache_position": cache_position}
-        key_states, value_states = past_key_values.update(
+        key_states, value_states = past_key_value.update(
             key_states, value_states, self.layer_idx, cache_kwargs
         )

@@ -64,10 +73,22 @@
         query_states = self.q_layer_norm(query_states)
         key_states = self.k_layer_norm(key_states)

-    attention_interface: Callable = eager_attention_forward
+    attention_interface: Callable = (
+        transformers.models.idefics.modeling_idefics.eager_attention_forward
+    )

     if self.config._attn_implementation != "eager":
-        attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        if self.config._attn_implementation == "sdpa" and output_attentions:
+            transformers.models.idefics.modeling_idefics.logger.warning_once(
+                "`torch.nn.functional.scaled_dot_product_attention` does not support "
+                "`output_attentions=True`. Falling back to "
+                "eager attention. This warning can be removed using the argument "
+                '`attn_implementation="eager"` when loading the model.'
+            )
+        else:
+            attention_interface = transformers.modeling_utils.ALL_ATTENTION_FUNCTIONS[
+                self.config._attn_implementation
+            ]

     attn_output, attn_weights = attention_interface(
         self,
@@ -83,4 +104,9 @@
     attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
     attn_output = self.o_proj(attn_output)

+    if output_attentions:
+        attn_weights = None
+
+    if pv.Version(transformers.__version__) < pv.Version("4.53.99"):
+        return attn_output, attn_weights, past_key_value
     return attn_output, attn_weights

auto/patch_transformers: IdeficsEmbedding.forward -> patched_IdeficsEmbedding.forward¶

--- original
+++ rewritten
@@ -1,9 +1,26 @@
 def forward(self, x, seq_len=None):
     # x: [bs, num_attention_heads, seq_len, head_size]
-    if seq_len > self.max_seq_len_cached:
-        self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+    # if seq_len > self.max_seq_len_cached:
+    #    self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

-    return (
-        self.cos_cached[:seq_len].to(dtype=x.dtype),
-        self.sin_cached[:seq_len].to(dtype=x.dtype),
+    def _set_cos_sin_cache_then(x, inv_freq, seq_len, _cos_cached, _sin_cached):
+        t = torch.arange(seq_len, device=x.device, dtype=torch.int64).type_as(inv_freq)
+        # freqs = torch.einsum("i,j->ij", t, inv_freq)
+        freqs = t.reshape((-1, 1)) * inv_freq.reshape((1, -1))
+        emb = torch.cat((freqs, freqs), dim=-1)
+        return emb.cos().to(x.dtype), emb.sin().to(x.dtype)
+
+    def _set_cos_sin_cache_else(_x, _inv_freq, _seq_len, cos_cached, sin_cached):
+        torch._check(seq_len.item() <= cos_cached.shape[0])
+        co = cos_cached[: seq_len.item()].detach().clone()
+        torch._check(seq_len.item() <= sin_cached.shape[0])
+        si = sin_cached[: seq_len.item()].detach().clone()
+        return co.to(dtype=x.dtype), si.to(dtype=x.dtype)
+
+    cos_cached, sin_cached = torch.cond(
+        (seq_len > self.max_seq_len_cached).item(),
+        _set_cos_sin_cache_then,
+        _set_cos_sin_cache_else,
+        [x, self.inv_freq, seq_len, self.cos_cached, self.sin_cached],
     )
+    return cos_cached, sin_cached

auto/patch_transformers: LlamaRotaryEmbedding.forward -> common_RotaryEmbedding.forward¶

--- original
+++ rewritten
@@ -1,8 +1,16 @@
-@torch.no_grad()
-@dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
-def forward(self, x, position_ids):
+@patched_dynamic_rope_update
+def forward(self, x, position_ids, layer_type=None):
+    if layer_type is not None:
+        # transformers>=5.0
+        inv_freq = getattr(self, f"{layer_type}_inv_freq")
+        attention_scaling = getattr(self, f"{layer_type}_attention_scaling")
+    else:
+        # transformers<5.0
+        inv_freq = self.inv_freq
+        attention_scaling = self.attention_scaling
+
     inv_freq_expanded = (
-        self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
     )
     position_ids_expanded = position_ids[:, None, :].float()

@@ -12,7 +20,7 @@
     with torch.autocast(device_type=device_type, enabled=False):  # Force float32
         freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
         emb = torch.cat((freqs, freqs), dim=-1)
-        cos = emb.cos() * self.attention_scaling
-        sin = emb.sin() * self.attention_scaling
+        cos = emb.cos() * attention_scaling
+        sin = emb.sin() * attention_scaling

     return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)

--- original
+++ rewritten
@@ -1,99 +1,193 @@
-def dynamic_rope_update(rope_forward):
-    """
-    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
-    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).
+def patched_dynamic_rope_update(rope_forward):
+    """manual patch: ``[patch:transformers.modeling_rope_utils.dynamic_rope_update]``

-    Args:
-        rope_forward (Callable):
-            The forward pass of the RoPE implementation.
+    ``rope_type`` is determined in the constructor of class
+    :class:`transformers.models.phi3.modeling_phi3.Phi3RotaryEmbedding`.

-    Returns:
-        The decorated forward pass.
+    .. code-block:: python
+
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            self.rope_type = config.rope_scaling.get(
+                "rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+
+    The original code of the patched function:
+
+    .. code-block:: python
+
+        def dynamic_rope_update(rope_forward):
+            def longrope_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if hasattr(self.config, "original_max_position_embeddings"):
+                    original_max_position_embeddings =
+                        self.config.original_max_position_embeddings
+                else:
+                    original_max_position_embeddings =
+                        self.config.max_position_embeddings
+                if seq_len > original_max_position_embeddings:
+                    if not hasattr(self, "long_inv_freq"):
+                        self.long_inv_freq, _ = self.rope_init_fn(
+                            self.config, device, seq_len=original_max_position_embeddings + 1
+                        )
+                    self.register_buffer("inv_freq", self.long_inv_freq, persistent=False)
+                else:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+
+            def dynamic_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if seq_len > self.max_seq_len_cached:  # growth
+                    inv_freq, self.attention_scaling = self.rope_init_fn(
+                        self.config, device, seq_len=seq_len)
+                    self.register_buffer("inv_freq", inv_freq, persistent=False)
+                    self.max_seq_len_cached = seq_len
+
+                if seq_len < self.original_max_seq_len and
+                        self.max_seq_len_cached > self.original_max_seq_len:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+                    self.max_seq_len_cached = self.original_max_seq_len
+
+            @wraps(rope_forward)
+            def wrapper(self, x, position_ids):
+                if "dynamic" in self.rope_type:
+                    dynamic_frequency_update(self, position_ids, device=x.device)
+                elif self.rope_type == "longrope":
+                    longrope_frequency_update(self, position_ids, device=x.device)
+                return rope_forward(self, x, position_ids)
+
+            return wrapper
+
     """

     def longrope_frequency_update(self, position_ids, device, layer_type=None):
-        """Longrope uses long factor if sequence is larger than original pretraining length, short otherwise."""
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
         seq_len = torch.max(position_ids) + 1
-        original_max_position_embeddings = getattr(
-            self.config, "original_max_position_embeddings", self.config.max_position_embeddings
-        )
+        if hasattr(self.config, "original_max_position_embeddings"):
+            original_max_position_embeddings = self.config.original_max_position_embeddings
+        else:
+            original_max_position_embeddings = self.config.max_position_embeddings
+
         if layer_type is None:
-            rope_type = self.rope_type
+            # rope_type = self.rope_type
             original_inv_freq = self.original_inv_freq
             prefix = ""
         else:
-            rope_type = self.rope_type[layer_type]
+            # rope_type = self.rope_type[layer_type]
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"

-        if seq_len > original_max_position_embeddings:
-            if not hasattr(self, f"{layer_type}_long_inv_freq"):
-                rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
-                long_inv_freq, _ = rope_init_fn(
-                    self.config,
-                    device,
-                    seq_len=original_max_position_embeddings + 1,
-                    layer_type=layer_type,
-                )
-            self.register_buffer(f"{prefix}inv_freq", long_inv_freq, persistent=False)
-            setattr(self, f"{prefix}long_inv_freq", long_inv_freq)
-        else:
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            original_inv_freq = original_inv_freq.to(device)
-            self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False)
-            setattr(self, f"{prefix}original_inv_freq", original_inv_freq)
+        # At export time, seq_len is unknown.
+        long_inv_freq, _ = rope_init_fn(
+            self.config, device, seq_len=original_max_position_embeddings + 1
+        )
+        original_inv_freq = self.original_inv_freq.to(device)
+
+        # PATCHED: uses torch.cond instead of a test
+        cond = (seq_len > original_max_position_embeddings).item()
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        setattr(self, f"{prefix}inv_freq", inv_freq)
+        # if seq_len > original_max_position_embeddings:
+        #    self.inv_freq = self.long_inv_freq
+        # else:
+        #    self.inv_freq = self.original_inv_freq

     def dynamic_frequency_update(self, position_ids, device, layer_type=None):
-        """
-        dynamic RoPE layers should recompute `inv_freq` in the following situations:
-        1 - growing beyond the cached sequence length (allow scaling)
-        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
-        """
+        # constructor:
+        # - self.max_seq_len_cached = config.max_position_embeddings
+        # - self.original_max_seq_len = config.max_position_embeddings
+        # - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
+
+        # This behaviour is difficult to translate.
+        # The sequence always grows.
+        # The test should always True.
+        # So:  self.max_seq_len_cached = max(self.max_seq_len_cached, seq_len) --> seq_len
+        #
+        # if seq_len > self.max_seq_len_cached:  # growth
+        #    inv_freq, self.attention_scaling = self.rope_init_fn(
+        #        self.config, device, seq_len=seq_len
+        #    )
+        #    self.register_buffer("inv_freq", inv_freq, persistent=False)
+        #    self.max_seq_len_cached = seq_len
+        #
+        # So we should not need what follows.
+        #
+        # cond = (seq_len > self.max_seq_len_cached).item()
+        # self.attention_scaling = torch.cond(
+        #    cond,
+        #    (lambda x, y: x.clone()),
+        #    (lambda x, y: y.clone()),
+        #    [attention_scaling, self.attention_scaling],
+        # )
+
         seq_len = torch.max(position_ids) + 1
+        long_inv_freq, self.attention_scaling = rope_init_fn(self.config, device, seq_len=seq_len)
+
         if layer_type is None:
-            rope_type = self.rope_type
-            max_seq_len_cached = self.max_seq_len_cached
+            # rope_type = self.rope_type
+            # max_seq_len_cached = self.max_seq_len_cached
             original_inv_freq = self.original_inv_freq
             prefix = ""
         else:
-            rope_type = self.rope_type[layer_type]
-            max_seq_len_cached = getattr(
-                self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
-            )
+            # rope_type = self.rope_type[layer_type]
+            # max_seq_len_cached = getattr(
+            #     self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
+            # )
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"

-        if seq_len > max_seq_len_cached:  # growth
-            rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
-            inv_freq, self.attention_scaling = rope_init_fn(
-                self.config,
-                device,
-                seq_len=seq_len,
-                layer_type=layer_type,
-            )
-            # TODO joao: may break with compilation
-            self.register_buffer(f"{prefix}inv_freq", inv_freq, persistent=False)
-            setattr(self, f"{layer_type}_max_seq_len_cached", seq_len)
+        # Second test to translate.
+        # Let's keep in mind, self.max_seq_len_cached = seq_len is likely to be True.
+        # But in that case the following condition is a way to restore the original cache.

-        if (
-            seq_len < self.original_max_seq_len and max_seq_len_cached > self.original_max_seq_len
-        ):  # reset
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            original_inv_freq = original_inv_freq.to(device)
-            self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False)
-            setattr(self, f"{prefix}original_inv_freq", original_inv_freq)
-            setattr(self, f"{layer_type}_max_seq_len_cached", self.original_max_seq_len)
+        # if (
+        #    seq_len < self.original_max_seq_len
+        #    and self.max_seq_len_cached > self.original_max_seq_len
+        # ):
+        #    self.original_inv_freq = self.original_inv_freq.to(device)
+        #    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+        #    self.max_seq_len_cached = self.original_max_seq_len
+
+        original_inv_freq = self.original_inv_freq.to(device)
+        cond = (seq_len >= self.original_max_seq_len).item()
+        # PATCHED: uses torch.cond instead of a test
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        setattr(self, f"{prefix}inv_freq", inv_freq)

     @wraps(rope_forward)
     def wrapper(self, x, position_ids, layer_type=None):
-        rope_type = self.rope_type if layer_type is None else self.rope_type[layer_type]
-        kwargs = {"layer_type": layer_type} if layer_type is not None else {}
-        if "dynamic" in rope_type:
-            dynamic_frequency_update(self, position_ids, device=x.device, **kwargs)
-        elif rope_type == "longrope":
-            longrope_frequency_update(self, position_ids, device=x.device, **kwargs)
-        return rope_forward(self, x, position_ids, **kwargs)
+        if layer_type is None:
+            if "dynamic" in self.rope_type:
+                dynamic_frequency_update(self, position_ids, device=x.device)
+            elif self.rope_type == "longrope":
+                longrope_frequency_update(self, position_ids, device=x.device)
+            return rope_forward(self, x, position_ids)
+
+        if "dynamic" in self.rope_type:
+            dynamic_frequency_update(self, position_ids, device=x.device, layer_type=layer_type)
+        elif self.rope_type == "longrope":
+            longrope_frequency_update(self, position_ids, device=x.device, layer_type=layer_type)
+        return rope_forward(self, x, position_ids, layer_type=layer_type)

     return wrapper

auto/patch_transformers: MistralRotaryEmbedding.forward -> common_RotaryEmbedding.forward¶

--- original
+++ rewritten
@@ -1,8 +1,16 @@
-@torch.no_grad()
-@dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
-def forward(self, x, position_ids):
+@patched_dynamic_rope_update
+def forward(self, x, position_ids, layer_type=None):
+    if layer_type is not None:
+        # transformers>=5.0
+        inv_freq = getattr(self, f"{layer_type}_inv_freq")
+        attention_scaling = getattr(self, f"{layer_type}_attention_scaling")
+    else:
+        # transformers<5.0
+        inv_freq = self.inv_freq
+        attention_scaling = self.attention_scaling
+
     inv_freq_expanded = (
-        self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
     )
     position_ids_expanded = position_ids[:, None, :].float()

@@ -12,7 +20,7 @@
     with torch.autocast(device_type=device_type, enabled=False):  # Force float32
         freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
         emb = torch.cat((freqs, freqs), dim=-1)
-        cos = emb.cos() * self.attention_scaling
-        sin = emb.sin() * self.attention_scaling
+        cos = emb.cos() * attention_scaling
+        sin = emb.sin() * attention_scaling

     return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)

--- original
+++ rewritten
@@ -1,99 +1,193 @@
-def dynamic_rope_update(rope_forward):
-    """
-    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
-    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).
+def patched_dynamic_rope_update(rope_forward):
+    """manual patch: ``[patch:transformers.modeling_rope_utils.dynamic_rope_update]``

-    Args:
-        rope_forward (Callable):
-            The forward pass of the RoPE implementation.
+    ``rope_type`` is determined in the constructor of class
+    :class:`transformers.models.phi3.modeling_phi3.Phi3RotaryEmbedding`.

-    Returns:
-        The decorated forward pass.
+    .. code-block:: python
+
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            self.rope_type = config.rope_scaling.get(
+                "rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+
+    The original code of the patched function:
+
+    .. code-block:: python
+
+        def dynamic_rope_update(rope_forward):
+            def longrope_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if hasattr(self.config, "original_max_position_embeddings"):
+                    original_max_position_embeddings =
+                        self.config.original_max_position_embeddings
+                else:
+                    original_max_position_embeddings =
+                        self.config.max_position_embeddings
+                if seq_len > original_max_position_embeddings:
+                    if not hasattr(self, "long_inv_freq"):
+                        self.long_inv_freq, _ = self.rope_init_fn(
+                            self.config, device, seq_len=original_max_position_embeddings + 1
+                        )
+                    self.register_buffer("inv_freq", self.long_inv_freq, persistent=False)
+                else:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+
+            def dynamic_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if seq_len > self.max_seq_len_cached:  # growth
+                    inv_freq, self.attention_scaling = self.rope_init_fn(
+                        self.config, device, seq_len=seq_len)
+                    self.register_buffer("inv_freq", inv_freq, persistent=False)
+                    self.max_seq_len_cached = seq_len
+
+                if seq_len < self.original_max_seq_len and
+                        self.max_seq_len_cached > self.original_max_seq_len:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+                    self.max_seq_len_cached = self.original_max_seq_len
+
+            @wraps(rope_forward)
+            def wrapper(self, x, position_ids):
+                if "dynamic" in self.rope_type:
+                    dynamic_frequency_update(self, position_ids, device=x.device)
+                elif self.rope_type == "longrope":
+                    longrope_frequency_update(self, position_ids, device=x.device)
+                return rope_forward(self, x, position_ids)
+
+            return wrapper
+
     """

     def longrope_frequency_update(self, position_ids, device, layer_type=None):
-        """Longrope uses long factor if sequence is larger than original pretraining length, short otherwise."""
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
         seq_len = torch.max(position_ids) + 1
-        original_max_position_embeddings = getattr(
-            self.config, "original_max_position_embeddings", self.config.max_position_embeddings
-        )
+        if hasattr(self.config, "original_max_position_embeddings"):
+            original_max_position_embeddings = self.config.original_max_position_embeddings
+        else:
+            original_max_position_embeddings = self.config.max_position_embeddings
+
         if layer_type is None:
-            rope_type = self.rope_type
+            # rope_type = self.rope_type
             original_inv_freq = self.original_inv_freq
             prefix = ""
         else:
-            rope_type = self.rope_type[layer_type]
+            # rope_type = self.rope_type[layer_type]
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"

-        if seq_len > original_max_position_embeddings:
-            if not hasattr(self, f"{layer_type}_long_inv_freq"):
-                rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
-                long_inv_freq, _ = rope_init_fn(
-                    self.config,
-                    device,
-                    seq_len=original_max_position_embeddings + 1,
-                    layer_type=layer_type,
-                )
-            self.register_buffer(f"{prefix}inv_freq", long_inv_freq, persistent=False)
-            setattr(self, f"{prefix}long_inv_freq", long_inv_freq)
-        else:
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            original_inv_freq = original_inv_freq.to(device)
-            self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False)
-            setattr(self, f"{prefix}original_inv_freq", original_inv_freq)
+        # At export time, seq_len is unknown.
+        long_inv_freq, _ = rope_init_fn(
+            self.config, device, seq_len=original_max_position_embeddings + 1
+        )
+        original_inv_freq = self.original_inv_freq.to(device)
+
+        # PATCHED: uses torch.cond instead of a test
+        cond = (seq_len > original_max_position_embeddings).item()
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        setattr(self, f"{prefix}inv_freq", inv_freq)
+        # if seq_len > original_max_position_embeddings:
+        #    self.inv_freq = self.long_inv_freq
+        # else:
+        #    self.inv_freq = self.original_inv_freq

     def dynamic_frequency_update(self, position_ids, device, layer_type=None):
-        """
-        dynamic RoPE layers should recompute `inv_freq` in the following situations:
-        1 - growing beyond the cached sequence length (allow scaling)
-        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
-        """
+        # constructor:
+        # - self.max_seq_len_cached = config.max_position_embeddings
+        # - self.original_max_seq_len = config.max_position_embeddings
+        # - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
+
+        # This behaviour is difficult to translate.
+        # The sequence always grows.
+        # The test should always True.
+        # So:  self.max_seq_len_cached = max(self.max_seq_len_cached, seq_len) --> seq_len
+        #
+        # if seq_len > self.max_seq_len_cached:  # growth
+        #    inv_freq, self.attention_scaling = self.rope_init_fn(
+        #        self.config, device, seq_len=seq_len
+        #    )
+        #    self.register_buffer("inv_freq", inv_freq, persistent=False)
+        #    self.max_seq_len_cached = seq_len
+        #
+        # So we should not need what follows.
+        #
+        # cond = (seq_len > self.max_seq_len_cached).item()
+        # self.attention_scaling = torch.cond(
+        #    cond,
+        #    (lambda x, y: x.clone()),
+        #    (lambda x, y: y.clone()),
+        #    [attention_scaling, self.attention_scaling],
+        # )
+
         seq_len = torch.max(position_ids) + 1
+        long_inv_freq, self.attention_scaling = rope_init_fn(self.config, device, seq_len=seq_len)
+
         if layer_type is None:
-            rope_type = self.rope_type
-            max_seq_len_cached = self.max_seq_len_cached
+            # rope_type = self.rope_type
+            # max_seq_len_cached = self.max_seq_len_cached
             original_inv_freq = self.original_inv_freq
             prefix = ""
         else:
-            rope_type = self.rope_type[layer_type]
-            max_seq_len_cached = getattr(
-                self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
-            )
+            # rope_type = self.rope_type[layer_type]
+            # max_seq_len_cached = getattr(
+            #     self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
+            # )
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"

-        if seq_len > max_seq_len_cached:  # growth
-            rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
-            inv_freq, self.attention_scaling = rope_init_fn(
-                self.config,
-                device,
-                seq_len=seq_len,
-                layer_type=layer_type,
-            )
-            # TODO joao: may break with compilation
-            self.register_buffer(f"{prefix}inv_freq", inv_freq, persistent=False)
-            setattr(self, f"{layer_type}_max_seq_len_cached", seq_len)
+        # Second test to translate.
+        # Let's keep in mind, self.max_seq_len_cached = seq_len is likely to be True.
+        # But in that case the following condition is a way to restore the original cache.

-        if (
-            seq_len < self.original_max_seq_len and max_seq_len_cached > self.original_max_seq_len
-        ):  # reset
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            original_inv_freq = original_inv_freq.to(device)
-            self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False)
-            setattr(self, f"{prefix}original_inv_freq", original_inv_freq)
-            setattr(self, f"{layer_type}_max_seq_len_cached", self.original_max_seq_len)
+        # if (
+        #    seq_len < self.original_max_seq_len
+        #    and self.max_seq_len_cached > self.original_max_seq_len
+        # ):
+        #    self.original_inv_freq = self.original_inv_freq.to(device)
+        #    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+        #    self.max_seq_len_cached = self.original_max_seq_len
+
+        original_inv_freq = self.original_inv_freq.to(device)
+        cond = (seq_len >= self.original_max_seq_len).item()
+        # PATCHED: uses torch.cond instead of a test
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        setattr(self, f"{prefix}inv_freq", inv_freq)

     @wraps(rope_forward)
     def wrapper(self, x, position_ids, layer_type=None):
-        rope_type = self.rope_type if layer_type is None else self.rope_type[layer_type]
-        kwargs = {"layer_type": layer_type} if layer_type is not None else {}
-        if "dynamic" in rope_type:
-            dynamic_frequency_update(self, position_ids, device=x.device, **kwargs)
-        elif rope_type == "longrope":
-            longrope_frequency_update(self, position_ids, device=x.device, **kwargs)
-        return rope_forward(self, x, position_ids, **kwargs)
+        if layer_type is None:
+            if "dynamic" in self.rope_type:
+                dynamic_frequency_update(self, position_ids, device=x.device)
+            elif self.rope_type == "longrope":
+                longrope_frequency_update(self, position_ids, device=x.device)
+            return rope_forward(self, x, position_ids)
+
+        if "dynamic" in self.rope_type:
+            dynamic_frequency_update(self, position_ids, device=x.device, layer_type=layer_type)
+        elif self.rope_type == "longrope":
+            longrope_frequency_update(self, position_ids, device=x.device, layer_type=layer_type)
+        return rope_forward(self, x, position_ids, layer_type=layer_type)

     return wrapper

auto/patch_transformers: MixtralRotaryEmbedding.forward -> common_RotaryEmbedding.forward¶

--- original
+++ rewritten
@@ -1,8 +1,16 @@
-@torch.no_grad()
-@dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
-def forward(self, x, position_ids):
+@patched_dynamic_rope_update
+def forward(self, x, position_ids, layer_type=None):
+    if layer_type is not None:
+        # transformers>=5.0
+        inv_freq = getattr(self, f"{layer_type}_inv_freq")
+        attention_scaling = getattr(self, f"{layer_type}_attention_scaling")
+    else:
+        # transformers<5.0
+        inv_freq = self.inv_freq
+        attention_scaling = self.attention_scaling
+
     inv_freq_expanded = (
-        self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
     )
     position_ids_expanded = position_ids[:, None, :].float()

@@ -12,7 +20,7 @@
     with torch.autocast(device_type=device_type, enabled=False):  # Force float32
         freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
         emb = torch.cat((freqs, freqs), dim=-1)
-        cos = emb.cos() * self.attention_scaling
-        sin = emb.sin() * self.attention_scaling
+        cos = emb.cos() * attention_scaling
+        sin = emb.sin() * attention_scaling

     return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)

--- original
+++ rewritten
@@ -1,99 +1,193 @@
-def dynamic_rope_update(rope_forward):
-    """
-    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
-    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).
+def patched_dynamic_rope_update(rope_forward):
+    """manual patch: ``[patch:transformers.modeling_rope_utils.dynamic_rope_update]``

-    Args:
-        rope_forward (Callable):
-            The forward pass of the RoPE implementation.
+    ``rope_type`` is determined in the constructor of class
+    :class:`transformers.models.phi3.modeling_phi3.Phi3RotaryEmbedding`.

-    Returns:
-        The decorated forward pass.
+    .. code-block:: python
+
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            self.rope_type = config.rope_scaling.get(
+                "rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+
+    The original code of the patched function:
+
+    .. code-block:: python
+
+        def dynamic_rope_update(rope_forward):
+            def longrope_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if hasattr(self.config, "original_max_position_embeddings"):
+                    original_max_position_embeddings =
+                        self.config.original_max_position_embeddings
+                else:
+                    original_max_position_embeddings =
+                        self.config.max_position_embeddings
+                if seq_len > original_max_position_embeddings:
+                    if not hasattr(self, "long_inv_freq"):
+                        self.long_inv_freq, _ = self.rope_init_fn(
+                            self.config, device, seq_len=original_max_position_embeddings + 1
+                        )
+                    self.register_buffer("inv_freq", self.long_inv_freq, persistent=False)
+                else:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+
+            def dynamic_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if seq_len > self.max_seq_len_cached:  # growth
+                    inv_freq, self.attention_scaling = self.rope_init_fn(
+                        self.config, device, seq_len=seq_len)
+                    self.register_buffer("inv_freq", inv_freq, persistent=False)
+                    self.max_seq_len_cached = seq_len
+
+                if seq_len < self.original_max_seq_len and
+                        self.max_seq_len_cached > self.original_max_seq_len:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+                    self.max_seq_len_cached = self.original_max_seq_len
+
+            @wraps(rope_forward)
+            def wrapper(self, x, position_ids):
+                if "dynamic" in self.rope_type:
+                    dynamic_frequency_update(self, position_ids, device=x.device)
+                elif self.rope_type == "longrope":
+                    longrope_frequency_update(self, position_ids, device=x.device)
+                return rope_forward(self, x, position_ids)
+
+            return wrapper
+
     """

     def longrope_frequency_update(self, position_ids, device, layer_type=None):
-        """Longrope uses long factor if sequence is larger than original pretraining length, short otherwise."""
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
         seq_len = torch.max(position_ids) + 1
-        original_max_position_embeddings = getattr(
-            self.config, "original_max_position_embeddings", self.config.max_position_embeddings
-        )
+        if hasattr(self.config, "original_max_position_embeddings"):
+            original_max_position_embeddings = self.config.original_max_position_embeddings
+        else:
+            original_max_position_embeddings = self.config.max_position_embeddings
+
         if layer_type is None:
-            rope_type = self.rope_type
+            # rope_type = self.rope_type
             original_inv_freq = self.original_inv_freq
             prefix = ""
         else:
-            rope_type = self.rope_type[layer_type]
+            # rope_type = self.rope_type[layer_type]
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"

-        if seq_len > original_max_position_embeddings:
-            if not hasattr(self, f"{layer_type}_long_inv_freq"):
-                rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
-                long_inv_freq, _ = rope_init_fn(
-                    self.config,
-                    device,
-                    seq_len=original_max_position_embeddings + 1,
-                    layer_type=layer_type,
-                )
-            self.register_buffer(f"{prefix}inv_freq", long_inv_freq, persistent=False)
-            setattr(self, f"{prefix}long_inv_freq", long_inv_freq)
-        else:
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            original_inv_freq = original_inv_freq.to(device)
-            self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False)
-            setattr(self, f"{prefix}original_inv_freq", original_inv_freq)
+        # At export time, seq_len is unknown.
+        long_inv_freq, _ = rope_init_fn(
+            self.config, device, seq_len=original_max_position_embeddings + 1
+        )
+        original_inv_freq = self.original_inv_freq.to(device)
+
+        # PATCHED: uses torch.cond instead of a test
+        cond = (seq_len > original_max_position_embeddings).item()
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        setattr(self, f"{prefix}inv_freq", inv_freq)
+        # if seq_len > original_max_position_embeddings:
+        #    self.inv_freq = self.long_inv_freq
+        # else:
+        #    self.inv_freq = self.original_inv_freq

     def dynamic_frequency_update(self, position_ids, device, layer_type=None):
-        """
-        dynamic RoPE layers should recompute `inv_freq` in the following situations:
-        1 - growing beyond the cached sequence length (allow scaling)
-        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
-        """
+        # constructor:
+        # - self.max_seq_len_cached = config.max_position_embeddings
+        # - self.original_max_seq_len = config.max_position_embeddings
+        # - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
+
+        # This behaviour is difficult to translate.
+        # The sequence always grows.
+        # The test should always True.
+        # So:  self.max_seq_len_cached = max(self.max_seq_len_cached, seq_len) --> seq_len
+        #
+        # if seq_len > self.max_seq_len_cached:  # growth
+        #    inv_freq, self.attention_scaling = self.rope_init_fn(
+        #        self.config, device, seq_len=seq_len
+        #    )
+        #    self.register_buffer("inv_freq", inv_freq, persistent=False)
+        #    self.max_seq_len_cached = seq_len
+        #
+        # So we should not need what follows.
+        #
+        # cond = (seq_len > self.max_seq_len_cached).item()
+        # self.attention_scaling = torch.cond(
+        #    cond,
+        #    (lambda x, y: x.clone()),
+        #    (lambda x, y: y.clone()),
+        #    [attention_scaling, self.attention_scaling],
+        # )
+
         seq_len = torch.max(position_ids) + 1
+        long_inv_freq, self.attention_scaling = rope_init_fn(self.config, device, seq_len=seq_len)
+
         if layer_type is None:
-            rope_type = self.rope_type
-            max_seq_len_cached = self.max_seq_len_cached
+            # rope_type = self.rope_type
+            # max_seq_len_cached = self.max_seq_len_cached
             original_inv_freq = self.original_inv_freq
             prefix = ""
         else:
-            rope_type = self.rope_type[layer_type]
-            max_seq_len_cached = getattr(
-                self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
-            )
+            # rope_type = self.rope_type[layer_type]
+            # max_seq_len_cached = getattr(
+            #     self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
+            # )
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"

-        if seq_len > max_seq_len_cached:  # growth
-            rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
-            inv_freq, self.attention_scaling = rope_init_fn(
-                self.config,
-                device,
-                seq_len=seq_len,
-                layer_type=layer_type,
-            )
-            # TODO joao: may break with compilation
-            self.register_buffer(f"{prefix}inv_freq", inv_freq, persistent=False)
-            setattr(self, f"{layer_type}_max_seq_len_cached", seq_len)
+        # Second test to translate.
+        # Let's keep in mind, self.max_seq_len_cached = seq_len is likely to be True.
+        # But in that case the following condition is a way to restore the original cache.

-        if (
-            seq_len < self.original_max_seq_len and max_seq_len_cached > self.original_max_seq_len
-        ):  # reset
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            original_inv_freq = original_inv_freq.to(device)
-            self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False)
-            setattr(self, f"{prefix}original_inv_freq", original_inv_freq)
-            setattr(self, f"{layer_type}_max_seq_len_cached", self.original_max_seq_len)
+        # if (
+        #    seq_len < self.original_max_seq_len
+        #    and self.max_seq_len_cached > self.original_max_seq_len
+        # ):
+        #    self.original_inv_freq = self.original_inv_freq.to(device)
+        #    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+        #    self.max_seq_len_cached = self.original_max_seq_len
+
+        original_inv_freq = self.original_inv_freq.to(device)
+        cond = (seq_len >= self.original_max_seq_len).item()
+        # PATCHED: uses torch.cond instead of a test
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        setattr(self, f"{prefix}inv_freq", inv_freq)

     @wraps(rope_forward)
     def wrapper(self, x, position_ids, layer_type=None):
-        rope_type = self.rope_type if layer_type is None else self.rope_type[layer_type]
-        kwargs = {"layer_type": layer_type} if layer_type is not None else {}
-        if "dynamic" in rope_type:
-            dynamic_frequency_update(self, position_ids, device=x.device, **kwargs)
-        elif rope_type == "longrope":
-            longrope_frequency_update(self, position_ids, device=x.device, **kwargs)
-        return rope_forward(self, x, position_ids, **kwargs)
+        if layer_type is None:
+            if "dynamic" in self.rope_type:
+                dynamic_frequency_update(self, position_ids, device=x.device)
+            elif self.rope_type == "longrope":
+                longrope_frequency_update(self, position_ids, device=x.device)
+            return rope_forward(self, x, position_ids)
+
+        if "dynamic" in self.rope_type:
+            dynamic_frequency_update(self, position_ids, device=x.device, layer_type=layer_type)
+        elif self.rope_type == "longrope":
+            longrope_frequency_update(self, position_ids, device=x.device, layer_type=layer_type)
+        return rope_forward(self, x, position_ids, layer_type=layer_type)

     return wrapper

auto/patch_transformers: Phi3RotaryEmbedding.forward -> common_RotaryEmbedding.forward¶

--- original
+++ rewritten
@@ -1,8 +1,16 @@
-@torch.no_grad()
-@dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
-def forward(self, x, position_ids):
+@patched_dynamic_rope_update
+def forward(self, x, position_ids, layer_type=None):
+    if layer_type is not None:
+        # transformers>=5.0
+        inv_freq = getattr(self, f"{layer_type}_inv_freq")
+        attention_scaling = getattr(self, f"{layer_type}_attention_scaling")
+    else:
+        # transformers<5.0
+        inv_freq = self.inv_freq
+        attention_scaling = self.attention_scaling
+
     inv_freq_expanded = (
-        self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
     )
     position_ids_expanded = position_ids[:, None, :].float()

@@ -12,7 +20,7 @@
     with torch.autocast(device_type=device_type, enabled=False):  # Force float32
         freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
         emb = torch.cat((freqs, freqs), dim=-1)
-        cos = emb.cos() * self.attention_scaling
-        sin = emb.sin() * self.attention_scaling
+        cos = emb.cos() * attention_scaling
+        sin = emb.sin() * attention_scaling

     return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)

--- original
+++ rewritten
@@ -1,99 +1,193 @@
-def dynamic_rope_update(rope_forward):
-    """
-    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
-    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).
+def patched_dynamic_rope_update(rope_forward):
+    """manual patch: ``[patch:transformers.modeling_rope_utils.dynamic_rope_update]``

-    Args:
-        rope_forward (Callable):
-            The forward pass of the RoPE implementation.
+    ``rope_type`` is determined in the constructor of class
+    :class:`transformers.models.phi3.modeling_phi3.Phi3RotaryEmbedding`.

-    Returns:
-        The decorated forward pass.
+    .. code-block:: python
+
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            self.rope_type = config.rope_scaling.get(
+                "rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+
+    The original code of the patched function:
+
+    .. code-block:: python
+
+        def dynamic_rope_update(rope_forward):
+            def longrope_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if hasattr(self.config, "original_max_position_embeddings"):
+                    original_max_position_embeddings =
+                        self.config.original_max_position_embeddings
+                else:
+                    original_max_position_embeddings =
+                        self.config.max_position_embeddings
+                if seq_len > original_max_position_embeddings:
+                    if not hasattr(self, "long_inv_freq"):
+                        self.long_inv_freq, _ = self.rope_init_fn(
+                            self.config, device, seq_len=original_max_position_embeddings + 1
+                        )
+                    self.register_buffer("inv_freq", self.long_inv_freq, persistent=False)
+                else:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+
+            def dynamic_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if seq_len > self.max_seq_len_cached:  # growth
+                    inv_freq, self.attention_scaling = self.rope_init_fn(
+                        self.config, device, seq_len=seq_len)
+                    self.register_buffer("inv_freq", inv_freq, persistent=False)
+                    self.max_seq_len_cached = seq_len
+
+                if seq_len < self.original_max_seq_len and
+                        self.max_seq_len_cached > self.original_max_seq_len:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+                    self.max_seq_len_cached = self.original_max_seq_len
+
+            @wraps(rope_forward)
+            def wrapper(self, x, position_ids):
+                if "dynamic" in self.rope_type:
+                    dynamic_frequency_update(self, position_ids, device=x.device)
+                elif self.rope_type == "longrope":
+                    longrope_frequency_update(self, position_ids, device=x.device)
+                return rope_forward(self, x, position_ids)
+
+            return wrapper
+
     """

     def longrope_frequency_update(self, position_ids, device, layer_type=None):
-        """Longrope uses long factor if sequence is larger than original pretraining length, short otherwise."""
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
         seq_len = torch.max(position_ids) + 1
-        original_max_position_embeddings = getattr(
-            self.config, "original_max_position_embeddings", self.config.max_position_embeddings
-        )
+        if hasattr(self.config, "original_max_position_embeddings"):
+            original_max_position_embeddings = self.config.original_max_position_embeddings
+        else:
+            original_max_position_embeddings = self.config.max_position_embeddings
+
         if layer_type is None:
-            rope_type = self.rope_type
+            # rope_type = self.rope_type
             original_inv_freq = self.original_inv_freq
             prefix = ""
         else:
-            rope_type = self.rope_type[layer_type]
+            # rope_type = self.rope_type[layer_type]
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"

-        if seq_len > original_max_position_embeddings:
-            if not hasattr(self, f"{layer_type}_long_inv_freq"):
-                rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
-                long_inv_freq, _ = rope_init_fn(
-                    self.config,
-                    device,
-                    seq_len=original_max_position_embeddings + 1,
-                    layer_type=layer_type,
-                )
-            self.register_buffer(f"{prefix}inv_freq", long_inv_freq, persistent=False)
-            setattr(self, f"{prefix}long_inv_freq", long_inv_freq)
-        else:
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            original_inv_freq = original_inv_freq.to(device)
-            self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False)
-            setattr(self, f"{prefix}original_inv_freq", original_inv_freq)
+        # At export time, seq_len is unknown.
+        long_inv_freq, _ = rope_init_fn(
+            self.config, device, seq_len=original_max_position_embeddings + 1
+        )
+        original_inv_freq = self.original_inv_freq.to(device)
+
+        # PATCHED: uses torch.cond instead of a test
+        cond = (seq_len > original_max_position_embeddings).item()
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        setattr(self, f"{prefix}inv_freq", inv_freq)
+        # if seq_len > original_max_position_embeddings:
+        #    self.inv_freq = self.long_inv_freq
+        # else:
+        #    self.inv_freq = self.original_inv_freq

     def dynamic_frequency_update(self, position_ids, device, layer_type=None):
-        """
-        dynamic RoPE layers should recompute `inv_freq` in the following situations:
-        1 - growing beyond the cached sequence length (allow scaling)
-        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
-        """
+        # constructor:
+        # - self.max_seq_len_cached = config.max_position_embeddings
+        # - self.original_max_seq_len = config.max_position_embeddings
+        # - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
+
+        # This behaviour is difficult to translate.
+        # The sequence always grows.
+        # The test should always True.
+        # So:  self.max_seq_len_cached = max(self.max_seq_len_cached, seq_len) --> seq_len
+        #
+        # if seq_len > self.max_seq_len_cached:  # growth
+        #    inv_freq, self.attention_scaling = self.rope_init_fn(
+        #        self.config, device, seq_len=seq_len
+        #    )
+        #    self.register_buffer("inv_freq", inv_freq, persistent=False)
+        #    self.max_seq_len_cached = seq_len
+        #
+        # So we should not need what follows.
+        #
+        # cond = (seq_len > self.max_seq_len_cached).item()
+        # self.attention_scaling = torch.cond(
+        #    cond,
+        #    (lambda x, y: x.clone()),
+        #    (lambda x, y: y.clone()),
+        #    [attention_scaling, self.attention_scaling],
+        # )
+
         seq_len = torch.max(position_ids) + 1
+        long_inv_freq, self.attention_scaling = rope_init_fn(self.config, device, seq_len=seq_len)
+
         if layer_type is None:
-            rope_type = self.rope_type
-            max_seq_len_cached = self.max_seq_len_cached
+            # rope_type = self.rope_type
+            # max_seq_len_cached = self.max_seq_len_cached
             original_inv_freq = self.original_inv_freq
             prefix = ""
         else:
-            rope_type = self.rope_type[layer_type]
-            max_seq_len_cached = getattr(
-                self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
-            )
+            # rope_type = self.rope_type[layer_type]
+            # max_seq_len_cached = getattr(
+            #     self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
+            # )
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"

-        if seq_len > max_seq_len_cached:  # growth
-            rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
-            inv_freq, self.attention_scaling = rope_init_fn(
-                self.config,
-                device,
-                seq_len=seq_len,
-                layer_type=layer_type,
-            )
-            # TODO joao: may break with compilation
-            self.register_buffer(f"{prefix}inv_freq", inv_freq, persistent=False)
-            setattr(self, f"{layer_type}_max_seq_len_cached", seq_len)
+        # Second test to translate.
+        # Let's keep in mind, self.max_seq_len_cached = seq_len is likely to be True.
+        # But in that case the following condition is a way to restore the original cache.

-        if (
-            seq_len < self.original_max_seq_len and max_seq_len_cached > self.original_max_seq_len
-        ):  # reset
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            original_inv_freq = original_inv_freq.to(device)
-            self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False)
-            setattr(self, f"{prefix}original_inv_freq", original_inv_freq)
-            setattr(self, f"{layer_type}_max_seq_len_cached", self.original_max_seq_len)
+        # if (
+        #    seq_len < self.original_max_seq_len
+        #    and self.max_seq_len_cached > self.original_max_seq_len
+        # ):
+        #    self.original_inv_freq = self.original_inv_freq.to(device)
+        #    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+        #    self.max_seq_len_cached = self.original_max_seq_len
+
+        original_inv_freq = self.original_inv_freq.to(device)
+        cond = (seq_len >= self.original_max_seq_len).item()
+        # PATCHED: uses torch.cond instead of a test
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        setattr(self, f"{prefix}inv_freq", inv_freq)

     @wraps(rope_forward)
     def wrapper(self, x, position_ids, layer_type=None):
-        rope_type = self.rope_type if layer_type is None else self.rope_type[layer_type]
-        kwargs = {"layer_type": layer_type} if layer_type is not None else {}
-        if "dynamic" in rope_type:
-            dynamic_frequency_update(self, position_ids, device=x.device, **kwargs)
-        elif rope_type == "longrope":
-            longrope_frequency_update(self, position_ids, device=x.device, **kwargs)
-        return rope_forward(self, x, position_ids, **kwargs)
+        if layer_type is None:
+            if "dynamic" in self.rope_type:
+                dynamic_frequency_update(self, position_ids, device=x.device)
+            elif self.rope_type == "longrope":
+                longrope_frequency_update(self, position_ids, device=x.device)
+            return rope_forward(self, x, position_ids)
+
+        if "dynamic" in self.rope_type:
+            dynamic_frequency_update(self, position_ids, device=x.device, layer_type=layer_type)
+        elif self.rope_type == "longrope":
+            longrope_frequency_update(self, position_ids, device=x.device, layer_type=layer_type)
+        return rope_forward(self, x, position_ids, layer_type=layer_type)

     return wrapper

auto/patch_transformers: Phi4MultimodalRotaryEmbedding.forward -> common_RotaryEmbedding.forward¶

--- original
+++ rewritten
@@ -1,8 +1,16 @@
-@torch.no_grad()
-@dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
-def forward(self, x, position_ids):
+@patched_dynamic_rope_update
+def forward(self, x, position_ids, layer_type=None):
+    if layer_type is not None:
+        # transformers>=5.0
+        inv_freq = getattr(self, f"{layer_type}_inv_freq")
+        attention_scaling = getattr(self, f"{layer_type}_attention_scaling")
+    else:
+        # transformers<5.0
+        inv_freq = self.inv_freq
+        attention_scaling = self.attention_scaling
+
     inv_freq_expanded = (
-        self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
     )
     position_ids_expanded = position_ids[:, None, :].float()

@@ -12,7 +20,7 @@
     with torch.autocast(device_type=device_type, enabled=False):  # Force float32
         freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
         emb = torch.cat((freqs, freqs), dim=-1)
-        cos = emb.cos() * self.attention_scaling
-        sin = emb.sin() * self.attention_scaling
+        cos = emb.cos() * attention_scaling
+        sin = emb.sin() * attention_scaling

     return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)

--- original
+++ rewritten
@@ -1,99 +1,193 @@
-def dynamic_rope_update(rope_forward):
-    """
-    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
-    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).
+def patched_dynamic_rope_update(rope_forward):
+    """manual patch: ``[patch:transformers.modeling_rope_utils.dynamic_rope_update]``

-    Args:
-        rope_forward (Callable):
-            The forward pass of the RoPE implementation.
+    ``rope_type`` is determined in the constructor of class
+    :class:`transformers.models.phi3.modeling_phi3.Phi3RotaryEmbedding`.

-    Returns:
-        The decorated forward pass.
+    .. code-block:: python
+
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            self.rope_type = config.rope_scaling.get(
+                "rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+
+    The original code of the patched function:
+
+    .. code-block:: python
+
+        def dynamic_rope_update(rope_forward):
+            def longrope_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if hasattr(self.config, "original_max_position_embeddings"):
+                    original_max_position_embeddings =
+                        self.config.original_max_position_embeddings
+                else:
+                    original_max_position_embeddings =
+                        self.config.max_position_embeddings
+                if seq_len > original_max_position_embeddings:
+                    if not hasattr(self, "long_inv_freq"):
+                        self.long_inv_freq, _ = self.rope_init_fn(
+                            self.config, device, seq_len=original_max_position_embeddings + 1
+                        )
+                    self.register_buffer("inv_freq", self.long_inv_freq, persistent=False)
+                else:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+
+            def dynamic_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if seq_len > self.max_seq_len_cached:  # growth
+                    inv_freq, self.attention_scaling = self.rope_init_fn(
+                        self.config, device, seq_len=seq_len)
+                    self.register_buffer("inv_freq", inv_freq, persistent=False)
+                    self.max_seq_len_cached = seq_len
+
+                if seq_len < self.original_max_seq_len and
+                        self.max_seq_len_cached > self.original_max_seq_len:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+                    self.max_seq_len_cached = self.original_max_seq_len
+
+            @wraps(rope_forward)
+            def wrapper(self, x, position_ids):
+                if "dynamic" in self.rope_type:
+                    dynamic_frequency_update(self, position_ids, device=x.device)
+                elif self.rope_type == "longrope":
+                    longrope_frequency_update(self, position_ids, device=x.device)
+                return rope_forward(self, x, position_ids)
+
+            return wrapper
+
     """

     def longrope_frequency_update(self, position_ids, device, layer_type=None):
-        """Longrope uses long factor if sequence is larger than original pretraining length, short otherwise."""
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
         seq_len = torch.max(position_ids) + 1
-        original_max_position_embeddings = getattr(
-            self.config, "original_max_position_embeddings", self.config.max_position_embeddings
-        )
+        if hasattr(self.config, "original_max_position_embeddings"):
+            original_max_position_embeddings = self.config.original_max_position_embeddings
+        else:
+            original_max_position_embeddings = self.config.max_position_embeddings
+
         if layer_type is None:
-            rope_type = self.rope_type
+            # rope_type = self.rope_type
             original_inv_freq = self.original_inv_freq
             prefix = ""
         else:
-            rope_type = self.rope_type[layer_type]
+            # rope_type = self.rope_type[layer_type]
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"

-        if seq_len > original_max_position_embeddings:
-            if not hasattr(self, f"{layer_type}_long_inv_freq"):
-                rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
-                long_inv_freq, _ = rope_init_fn(
-                    self.config,
-                    device,
-                    seq_len=original_max_position_embeddings + 1,
-                    layer_type=layer_type,
-                )
-            self.register_buffer(f"{prefix}inv_freq", long_inv_freq, persistent=False)
-            setattr(self, f"{prefix}long_inv_freq", long_inv_freq)
-        else:
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            original_inv_freq = original_inv_freq.to(device)
-            self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False)
-            setattr(self, f"{prefix}original_inv_freq", original_inv_freq)
+        # At export time, seq_len is unknown.
+        long_inv_freq, _ = rope_init_fn(
+            self.config, device, seq_len=original_max_position_embeddings + 1
+        )
+        original_inv_freq = self.original_inv_freq.to(device)
+
+        # PATCHED: uses torch.cond instead of a test
+        cond = (seq_len > original_max_position_embeddings).item()
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        setattr(self, f"{prefix}inv_freq", inv_freq)
+        # if seq_len > original_max_position_embeddings:
+        #    self.inv_freq = self.long_inv_freq
+        # else:
+        #    self.inv_freq = self.original_inv_freq

     def dynamic_frequency_update(self, position_ids, device, layer_type=None):
-        """
-        dynamic RoPE layers should recompute `inv_freq` in the following situations:
-        1 - growing beyond the cached sequence length (allow scaling)
-        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
-        """
+        # constructor:
+        # - self.max_seq_len_cached = config.max_position_embeddings
+        # - self.original_max_seq_len = config.max_position_embeddings
+        # - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
+
+        # This behaviour is difficult to translate.
+        # The sequence always grows.
+        # The test should always True.
+        # So:  self.max_seq_len_cached = max(self.max_seq_len_cached, seq_len) --> seq_len
+        #
+        # if seq_len > self.max_seq_len_cached:  # growth
+        #    inv_freq, self.attention_scaling = self.rope_init_fn(
+        #        self.config, device, seq_len=seq_len
+        #    )
+        #    self.register_buffer("inv_freq", inv_freq, persistent=False)
+        #    self.max_seq_len_cached = seq_len
+        #
+        # So we should not need what follows.
+        #
+        # cond = (seq_len > self.max_seq_len_cached).item()
+        # self.attention_scaling = torch.cond(
+        #    cond,
+        #    (lambda x, y: x.clone()),
+        #    (lambda x, y: y.clone()),
+        #    [attention_scaling, self.attention_scaling],
+        # )
+
         seq_len = torch.max(position_ids) + 1
+        long_inv_freq, self.attention_scaling = rope_init_fn(self.config, device, seq_len=seq_len)
+
         if layer_type is None:
-            rope_type = self.rope_type
-            max_seq_len_cached = self.max_seq_len_cached
+            # rope_type = self.rope_type
+            # max_seq_len_cached = self.max_seq_len_cached
             original_inv_freq = self.original_inv_freq
             prefix = ""
         else:
-            rope_type = self.rope_type[layer_type]
-            max_seq_len_cached = getattr(
-                self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
-            )
+            # rope_type = self.rope_type[layer_type]
+            # max_seq_len_cached = getattr(
+            #     self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
+            # )
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"

-        if seq_len > max_seq_len_cached:  # growth
-            rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
-            inv_freq, self.attention_scaling = rope_init_fn(
-                self.config,
-                device,
-                seq_len=seq_len,
-                layer_type=layer_type,
-            )
-            # TODO joao: may break with compilation
-            self.register_buffer(f"{prefix}inv_freq", inv_freq, persistent=False)
-            setattr(self, f"{layer_type}_max_seq_len_cached", seq_len)
+        # Second test to translate.
+        # Let's keep in mind, self.max_seq_len_cached = seq_len is likely to be True.
+        # But in that case the following condition is a way to restore the original cache.

-        if (
-            seq_len < self.original_max_seq_len and max_seq_len_cached > self.original_max_seq_len
-        ):  # reset
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            original_inv_freq = original_inv_freq.to(device)
-            self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False)
-            setattr(self, f"{prefix}original_inv_freq", original_inv_freq)
-            setattr(self, f"{layer_type}_max_seq_len_cached", self.original_max_seq_len)
+        # if (
+        #    seq_len < self.original_max_seq_len
+        #    and self.max_seq_len_cached > self.original_max_seq_len
+        # ):
+        #    self.original_inv_freq = self.original_inv_freq.to(device)
+        #    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+        #    self.max_seq_len_cached = self.original_max_seq_len
+
+        original_inv_freq = self.original_inv_freq.to(device)
+        cond = (seq_len >= self.original_max_seq_len).item()
+        # PATCHED: uses torch.cond instead of a test
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        setattr(self, f"{prefix}inv_freq", inv_freq)

     @wraps(rope_forward)
     def wrapper(self, x, position_ids, layer_type=None):
-        rope_type = self.rope_type if layer_type is None else self.rope_type[layer_type]
-        kwargs = {"layer_type": layer_type} if layer_type is not None else {}
-        if "dynamic" in rope_type:
-            dynamic_frequency_update(self, position_ids, device=x.device, **kwargs)
-        elif rope_type == "longrope":
-            longrope_frequency_update(self, position_ids, device=x.device, **kwargs)
-        return rope_forward(self, x, position_ids, **kwargs)
+        if layer_type is None:
+            if "dynamic" in self.rope_type:
+                dynamic_frequency_update(self, position_ids, device=x.device)
+            elif self.rope_type == "longrope":
+                longrope_frequency_update(self, position_ids, device=x.device)
+            return rope_forward(self, x, position_ids)
+
+        if "dynamic" in self.rope_type:
+            dynamic_frequency_update(self, position_ids, device=x.device, layer_type=layer_type)
+        elif self.rope_type == "longrope":
+            longrope_frequency_update(self, position_ids, device=x.device, layer_type=layer_type)
+        return rope_forward(self, x, position_ids, layer_type=layer_type)

     return wrapper

auto/patch_transformers: PhiRotaryEmbedding.forward -> common_RotaryEmbedding.forward¶

--- original
+++ rewritten
@@ -1,8 +1,16 @@
-@torch.no_grad()
-@dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
-def forward(self, x, position_ids):
+@patched_dynamic_rope_update
+def forward(self, x, position_ids, layer_type=None):
+    if layer_type is not None:
+        # transformers>=5.0
+        inv_freq = getattr(self, f"{layer_type}_inv_freq")
+        attention_scaling = getattr(self, f"{layer_type}_attention_scaling")
+    else:
+        # transformers<5.0
+        inv_freq = self.inv_freq
+        attention_scaling = self.attention_scaling
+
     inv_freq_expanded = (
-        self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
     )
     position_ids_expanded = position_ids[:, None, :].float()

@@ -12,7 +20,7 @@
     with torch.autocast(device_type=device_type, enabled=False):  # Force float32
         freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
         emb = torch.cat((freqs, freqs), dim=-1)
-        cos = emb.cos() * self.attention_scaling
-        sin = emb.sin() * self.attention_scaling
+        cos = emb.cos() * attention_scaling
+        sin = emb.sin() * attention_scaling

     return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)

--- original
+++ rewritten
@@ -1,99 +1,193 @@
-def dynamic_rope_update(rope_forward):
-    """
-    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
-    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).
+def patched_dynamic_rope_update(rope_forward):
+    """manual patch: ``[patch:transformers.modeling_rope_utils.dynamic_rope_update]``

-    Args:
-        rope_forward (Callable):
-            The forward pass of the RoPE implementation.
+    ``rope_type`` is determined in the constructor of class
+    :class:`transformers.models.phi3.modeling_phi3.Phi3RotaryEmbedding`.

-    Returns:
-        The decorated forward pass.
+    .. code-block:: python
+
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            self.rope_type = config.rope_scaling.get(
+                "rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+
+    The original code of the patched function:
+
+    .. code-block:: python
+
+        def dynamic_rope_update(rope_forward):
+            def longrope_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if hasattr(self.config, "original_max_position_embeddings"):
+                    original_max_position_embeddings =
+                        self.config.original_max_position_embeddings
+                else:
+                    original_max_position_embeddings =
+                        self.config.max_position_embeddings
+                if seq_len > original_max_position_embeddings:
+                    if not hasattr(self, "long_inv_freq"):
+                        self.long_inv_freq, _ = self.rope_init_fn(
+                            self.config, device, seq_len=original_max_position_embeddings + 1
+                        )
+                    self.register_buffer("inv_freq", self.long_inv_freq, persistent=False)
+                else:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+
+            def dynamic_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if seq_len > self.max_seq_len_cached:  # growth
+                    inv_freq, self.attention_scaling = self.rope_init_fn(
+                        self.config, device, seq_len=seq_len)
+                    self.register_buffer("inv_freq", inv_freq, persistent=False)
+                    self.max_seq_len_cached = seq_len
+
+                if seq_len < self.original_max_seq_len and
+                        self.max_seq_len_cached > self.original_max_seq_len:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+                    self.max_seq_len_cached = self.original_max_seq_len
+
+            @wraps(rope_forward)
+            def wrapper(self, x, position_ids):
+                if "dynamic" in self.rope_type:
+                    dynamic_frequency_update(self, position_ids, device=x.device)
+                elif self.rope_type == "longrope":
+                    longrope_frequency_update(self, position_ids, device=x.device)
+                return rope_forward(self, x, position_ids)
+
+            return wrapper
+
     """

     def longrope_frequency_update(self, position_ids, device, layer_type=None):
-        """Longrope uses long factor if sequence is larger than original pretraining length, short otherwise."""
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
         seq_len = torch.max(position_ids) + 1
-        original_max_position_embeddings = getattr(
-            self.config, "original_max_position_embeddings", self.config.max_position_embeddings
-        )
+        if hasattr(self.config, "original_max_position_embeddings"):
+            original_max_position_embeddings = self.config.original_max_position_embeddings
+        else:
+            original_max_position_embeddings = self.config.max_position_embeddings
+
         if layer_type is None:
-            rope_type = self.rope_type
+            # rope_type = self.rope_type
             original_inv_freq = self.original_inv_freq
             prefix = ""
         else:
-            rope_type = self.rope_type[layer_type]
+            # rope_type = self.rope_type[layer_type]
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"

-        if seq_len > original_max_position_embeddings:
-            if not hasattr(self, f"{layer_type}_long_inv_freq"):
-                rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
-                long_inv_freq, _ = rope_init_fn(
-                    self.config,
-                    device,
-                    seq_len=original_max_position_embeddings + 1,
-                    layer_type=layer_type,
-                )
-            self.register_buffer(f"{prefix}inv_freq", long_inv_freq, persistent=False)
-            setattr(self, f"{prefix}long_inv_freq", long_inv_freq)
-        else:
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            original_inv_freq = original_inv_freq.to(device)
-            self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False)
-            setattr(self, f"{prefix}original_inv_freq", original_inv_freq)
+        # At export time, seq_len is unknown.
+        long_inv_freq, _ = rope_init_fn(
+            self.config, device, seq_len=original_max_position_embeddings + 1
+        )
+        original_inv_freq = self.original_inv_freq.to(device)
+
+        # PATCHED: uses torch.cond instead of a test
+        cond = (seq_len > original_max_position_embeddings).item()
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        setattr(self, f"{prefix}inv_freq", inv_freq)
+        # if seq_len > original_max_position_embeddings:
+        #    self.inv_freq = self.long_inv_freq
+        # else:
+        #    self.inv_freq = self.original_inv_freq

     def dynamic_frequency_update(self, position_ids, device, layer_type=None):
-        """
-        dynamic RoPE layers should recompute `inv_freq` in the following situations:
-        1 - growing beyond the cached sequence length (allow scaling)
-        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
-        """
+        # constructor:
+        # - self.max_seq_len_cached = config.max_position_embeddings
+        # - self.original_max_seq_len = config.max_position_embeddings
+        # - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
+
+        # This behaviour is difficult to translate.
+        # The sequence always grows.
+        # The test should always True.
+        # So:  self.max_seq_len_cached = max(self.max_seq_len_cached, seq_len) --> seq_len
+        #
+        # if seq_len > self.max_seq_len_cached:  # growth
+        #    inv_freq, self.attention_scaling = self.rope_init_fn(
+        #        self.config, device, seq_len=seq_len
+        #    )
+        #    self.register_buffer("inv_freq", inv_freq, persistent=False)
+        #    self.max_seq_len_cached = seq_len
+        #
+        # So we should not need what follows.
+        #
+        # cond = (seq_len > self.max_seq_len_cached).item()
+        # self.attention_scaling = torch.cond(
+        #    cond,
+        #    (lambda x, y: x.clone()),
+        #    (lambda x, y: y.clone()),
+        #    [attention_scaling, self.attention_scaling],
+        # )
+
         seq_len = torch.max(position_ids) + 1
+        long_inv_freq, self.attention_scaling = rope_init_fn(self.config, device, seq_len=seq_len)
+
         if layer_type is None:
-            rope_type = self.rope_type
-            max_seq_len_cached = self.max_seq_len_cached
+            # rope_type = self.rope_type
+            # max_seq_len_cached = self.max_seq_len_cached
             original_inv_freq = self.original_inv_freq
             prefix = ""
         else:
-            rope_type = self.rope_type[layer_type]
-            max_seq_len_cached = getattr(
-                self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
-            )
+            # rope_type = self.rope_type[layer_type]
+            # max_seq_len_cached = getattr(
+            #     self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
+            # )
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"

-        if seq_len > max_seq_len_cached:  # growth
-            rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
-            inv_freq, self.attention_scaling = rope_init_fn(
-                self.config,
-                device,
-                seq_len=seq_len,
-                layer_type=layer_type,
-            )
-            # TODO joao: may break with compilation
-            self.register_buffer(f"{prefix}inv_freq", inv_freq, persistent=False)
-            setattr(self, f"{layer_type}_max_seq_len_cached", seq_len)
+        # Second test to translate.
+        # Let's keep in mind, self.max_seq_len_cached = seq_len is likely to be True.
+        # But in that case the following condition is a way to restore the original cache.

-        if (
-            seq_len < self.original_max_seq_len and max_seq_len_cached > self.original_max_seq_len
-        ):  # reset
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            original_inv_freq = original_inv_freq.to(device)
-            self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False)
-            setattr(self, f"{prefix}original_inv_freq", original_inv_freq)
-            setattr(self, f"{layer_type}_max_seq_len_cached", self.original_max_seq_len)
+        # if (
+        #    seq_len < self.original_max_seq_len
+        #    and self.max_seq_len_cached > self.original_max_seq_len
+        # ):
+        #    self.original_inv_freq = self.original_inv_freq.to(device)
+        #    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+        #    self.max_seq_len_cached = self.original_max_seq_len
+
+        original_inv_freq = self.original_inv_freq.to(device)
+        cond = (seq_len >= self.original_max_seq_len).item()
+        # PATCHED: uses torch.cond instead of a test
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        setattr(self, f"{prefix}inv_freq", inv_freq)

     @wraps(rope_forward)
     def wrapper(self, x, position_ids, layer_type=None):
-        rope_type = self.rope_type if layer_type is None else self.rope_type[layer_type]
-        kwargs = {"layer_type": layer_type} if layer_type is not None else {}
-        if "dynamic" in rope_type:
-            dynamic_frequency_update(self, position_ids, device=x.device, **kwargs)
-        elif rope_type == "longrope":
-            longrope_frequency_update(self, position_ids, device=x.device, **kwargs)
-        return rope_forward(self, x, position_ids, **kwargs)
+        if layer_type is None:
+            if "dynamic" in self.rope_type:
+                dynamic_frequency_update(self, position_ids, device=x.device)
+            elif self.rope_type == "longrope":
+                longrope_frequency_update(self, position_ids, device=x.device)
+            return rope_forward(self, x, position_ids)
+
+        if "dynamic" in self.rope_type:
+            dynamic_frequency_update(self, position_ids, device=x.device, layer_type=layer_type)
+        elif self.rope_type == "longrope":
+            longrope_frequency_update(self, position_ids, device=x.device, layer_type=layer_type)
+        return rope_forward(self, x, position_ids, layer_type=layer_type)

     return wrapper

auto/patch_transformers: Qwen2_5_VLForConditionalGeneration.prepare_inputs_for_generation -> patched_Qwen2_5_VLForConditionalGeneration.prepare_inputs_for_generation¶

--- original
+++ rewritten
@@ -14,9 +14,12 @@
     second_per_grid_ts=None,
     **kwargs,
 ):
-    # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+    # Overwritten -- in specific circumstances we don't want to f
+    # forward image inputs to the model
+    from transformers.generation import GenerationMixin

-    model_inputs = super().prepare_inputs_for_generation(
+    model_inputs = GenerationMixin.prepare_inputs_for_generation(
+        self,
         input_ids,
         past_key_values=past_key_values,
         attention_mask=attention_mask,
@@ -36,8 +39,8 @@
     if position_ids is None:
         # Calculate RoPE index once per generation in the pre-fill stage only.
         # When compiling, we can't check tensor values thus we check only input length
-        # It is safe to assume that `length!=1` means we're in pre-fill because compiled
-        # models currently cannot do assisted decoding
+        # It is safe to assume that `length!=1` means we're in pre-fill
+        # because compiled models currently cannot do assisted decoding
         if cache_position[0] == 0 or self.model.rope_deltas is None:
             vision_positions, rope_deltas = self.model.get_rope_index(
                 model_inputs.get("input_ids", None),
@@ -48,7 +51,7 @@
             )
             self.model.rope_deltas = rope_deltas
         # then use the prev pre-calculated rope-deltas to get the correct position ids
-        elif "position_ids" in model_inputs:
+        elif "position_ids" in model_inputs and model_inputs["position_ids"] is not None:
             batch_size, seq_length = model_inputs["position_ids"].shape
             device = model_inputs["position_ids"].device
             position_ids = torch.arange(seq_length, device=device)
@@ -58,7 +61,14 @@
             vision_positions = position_ids + delta.expand_as(position_ids)

         # Concatenate "text + vision" positions into [4, bs, seq-len]
-        text_positions = model_inputs["position_ids"][None, ...]
+        if "position_ids" not in model_inputs or model_inputs["position_ids"] is None:
+            text_positions = torch.arange(input_ids.shape[1], device=input_ids.device)[
+                None, None, :
+            ]
+        else:
+            text_positions = model_inputs["position_ids"][None, ...]
+        # text_positions = model_inputs["position_ids"][None, ...]
+        assert vision_positions is not None, "vision_positions are missing"
         model_inputs["position_ids"] = torch.cat([text_positions, vision_positions], dim=0)

     if cache_position[0] != 0:

auto/patch_transformers: Qwen2_5_VLVisionAttention.forward -> patched_Qwen2_5_VLVisionAttention.forward¶

--- original
+++ rewritten
@@ -7,24 +7,59 @@
     **kwargs,
 ) -> torch.Tensor:
     seq_length = hidden_states.shape[0]
-    query_states, key_states, value_states = (
-        self.qkv(hidden_states)
-        .reshape(seq_length, 3, self.num_heads, -1)
-        .permute(1, 0, 2, 3)
-        .unbind(0)
+    # PATCHED: avoid the use of unbind
+    qkv = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3)
+
+    query_states, key_states, value_states = qkv[0], qkv[1], qkv[2]
+    cos, sin = position_embeddings
+
+    # This part should be moved into the loop
+    # iteration to enable fusion inside the loop.
+    query_states, key_states = (
+        transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.apply_rotary_pos_emb_vision(
+            query_states, key_states, cos, sin
+        )
     )
-    cos, sin = position_embeddings
-    query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin)

     query_states = query_states.transpose(0, 1).unsqueeze(0)
     key_states = key_states.transpose(0, 1).unsqueeze(0)
     value_states = value_states.transpose(0, 1).unsqueeze(0)

-    attention_interface: Callable = eager_attention_forward
+    attention_interface: Callable = (
+        transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.eager_attention_forward
+    )
     if self.config._attn_implementation != "eager":
-        attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        # PATCHED
+        # attention_interface = ALL_ATTENTION_FUNCTIONS[
+        #       self.config._attn_implementation]
+        attention_interface = transformers.modeling_utils.ALL_ATTENTION_FUNCTIONS[
+            self.config._attn_implementation
+        ]

-    if self.config._attn_implementation == "flash_attention_2":
+    if self.config._attn_implementation == "flash_attention_2" and _is_torchdynamo_exporting():
+        max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
+        attn_output = torch.onnx.ops.symbolic(
+            "custom::qwen25_attention",
+            (
+                query_states,
+                key_states,
+                value_states,
+                cu_seqlens,
+                cu_seqlens,
+                max_seqlen,
+                max_seqlen,
+                torch.tensor(self.scaling, dtype=torch.float32),
+            ),
+            dtype=query_states.dtype,
+            shape=(
+                key_states.shape[0],
+                value_states.shape[1],
+                max_seqlen,
+                value_states.shape[-1],
+            ),
+            version=1,
+        )
+    elif self.config._attn_implementation == "flash_attention_2":
         # Flash Attention 2: Use cu_seqlens for variable length attention
         max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
         attn_output, _ = attention_interface(
@@ -42,6 +77,67 @@
             is_causal=False,
             **kwargs,
         )
+    elif _is_torchdynamo_exporting():
+        if attention_interface is transformers.integrations.sdpa_attention.sdpa_attention_forward:
+            attention_interface = patched_sdpa_attention_forward
+
+        if use_loop_for_attention_in_qwen_2_5:
+
+            def _iteration(start_end, query_states, key_states, value_states):
+                return patched_Qwen2_5_VLVisionAttentionOneIteration.forward(
+                    self,
+                    start_end,
+                    query_states,
+                    key_states,
+                    value_states,
+                    scaling=self.scaling,
+                    dropout=0.0 if not self.training else self.attention_dropout,
+                )
+
+            starts = cu_seqlens[:-1]
+            ends = cu_seqlens[1:]
+            # cu_seqlens = [0, 10, 14, 27]
+            # starts: [0, 10, 14]
+            # ends: [10, 14, 17]
+            # starts_ends: [[0, 10], [10, 14], [14, 27]]
+            starts_ends = torch.cat([starts.unsqueeze(1), ends.unsqueeze(1)], dim=1)
+            attn_outputs = [
+                _iteration(start_end, query_states, key_states, value_states)
+                for start_end in starts_ends
+            ]
+            # attn_outputs = torch._higher_order_ops.while_loop(
+            # attn_outputs = torch.ops.higher_order.while_loop(
+            #    (lambda it, starts_ends, *_args: it < starts_ends.shape[0]),
+            #    _iteration,
+            #    (torch.tensor(0),
+            #       starts_ends, query_states, key_states, value_states), tuple(),
+            # )
+            attn_output = torch.cat(attn_outputs, dim=1)
+        else:
+            # make square mask
+            indices = torch.arange(
+                cu_seqlens.max(), dtype=cu_seqlens.dtype, device=cu_seqlens.device
+            )
+            dot = (cu_seqlens.unsqueeze(1) <= indices.unsqueeze(0)).to(cu_seqlens.dtype)
+            dot = dot.sum(dim=0)
+            mask = dot.unsqueeze(1) - dot.unsqueeze(0)
+            bool_mask = mask == 0
+            bool_mask = bool_mask.unsqueeze(0).unsqueeze(0)
+
+            torch._check(bool_mask.shape[2] == key_states.shape[2])
+            torch._check(bool_mask.shape[3] == key_states.shape[2])
+
+            attn_output, _ = attention_interface(
+                self,
+                query_states,
+                key_states,
+                value_states,
+                attention_mask=bool_mask,
+                scaling=self.scaling,
+                dropout=0.0 if not self.training else self.attention_dropout,
+                is_causal=False,
+                **kwargs,
+            )
     else:
         # Other implementations: Process each chunk separately
         lengths = cu_seqlens[1:] - cu_seqlens[:-1]

auto/patch_transformers: Qwen2_5_VisionTransformerPretrainedModel.get_window_index -> patched_Qwen2_5_VisionTransformerPretrainedModel.get_window_index¶

--- original
+++ rewritten
@@ -1,10 +1,15 @@
 def get_window_index(self, grid_thw):
-    window_index: list = []
-    cu_window_seqlens: list = [0]
+    window_index: list = []  # type: ignore[annotation-unchecked]
+    # PATCHED
+    cu_window_seqlens: list = [torch.tensor([0], dtype=torch.int64)]  # type: ignore[annotation-unchecked]
     window_index_id = 0
     vit_merger_window_size = self.window_size // self.spatial_merge_size // self.patch_size

-    for grid_t, grid_h, grid_w in grid_thw:
+    for _thw in grid_thw:
+        # PATCHED: avoid unbind
+        grid_t = _thw[0]
+        grid_h = _thw[1]
+        grid_w = _thw[2]
         llm_grid_h, llm_grid_w = (
             grid_h // self.spatial_merge_size,
             grid_w // self.spatial_merge_size,
@@ -34,9 +39,11 @@
         index_padded = index_padded.reshape(-1)
         index_new = index_padded[index_padded != -100]
         window_index.append(index_new + window_index_id)
-        cu_seqlens_tmp = seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1]
-        cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
+        cu_seqlens_tmp = seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1][-1:]
+        # PATCHED
+        # cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
+        cu_window_seqlens.append(cu_seqlens_tmp)
         window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
     window_index = torch.cat(window_index, dim=0)

-    return window_index, cu_window_seqlens
+    return window_index, torch.cat(cu_window_seqlens, dim=0)

auto/patch_transformers: Qwen2_5_VisionTransformerPretrainedModel.forward -> patched_Qwen2_5_VisionTransformerPretrainedModel.forward¶

--- original
+++ rewritten
@@ -12,11 +12,13 @@
     hidden_states = self.patch_embed(hidden_states)
     rotary_pos_emb = self.rot_pos_emb(grid_thw)
     window_index, cu_window_seqlens = self.get_window_index(grid_thw)
-    cu_window_seqlens = torch.tensor(
-        cu_window_seqlens,
-        device=hidden_states.device,
-        dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
-    )
+    # PATCHED
+    # cu_window_seqlens = torch.tensor(
+    #    cu_window_seqlens,
+    #    device=hidden_states.device,
+    #    dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
+    # )
+    cu_window_seqlens = cu_window_seqlens.to(hidden_states.device).to(grid_thw.dtype)
     cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)

     seq_len, _ = hidden_states.size()
@@ -37,8 +39,10 @@
         dim=0,
         # Select dtype based on the following factors:
         #  - FA2 requires that cu_seqlens_q must have dtype int32
-        #  - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
-        # See https://github.com/huggingface/transformers/pull/34852 for more information
+        #  - torch.onnx.export requires that cu_seqlens_q must have same dtype
+        # as grid_thw
+        # See https://github.com/huggingface/transformers/pull/34852
+        # for more information
         dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
     )
     cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
@@ -59,5 +63,4 @@
     hidden_states = self.merger(hidden_states)
     reverse_indices = torch.argsort(window_index)
     hidden_states = hidden_states[reverse_indices, :]
-
     return hidden_states

auto/patch_transformers: Qwen2_5_VisionTransformerPretrainedModel.rot_pos_emb -> patched_Qwen2_5_VisionTransformerPretrainedModel.rot_pos_emb¶

--- original
+++ rewritten
@@ -1,6 +1,10 @@
 def rot_pos_emb(self, grid_thw):
     pos_ids = []
-    for t, h, w in grid_thw:
+    for thw_ in grid_thw:
+        # PATCHED: avoid unbind
+        t = thw_[0]
+        h = thw_[1]
+        w = thw_[2]
         hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
         hpos_ids = hpos_ids.reshape(
             h // self.spatial_merge_size,

auto/patch_transformers: Qwen3MoeSparseMoeBlock.forward -> patched_Qwen3MoeSparseMoeBlock.forward¶

--- original
+++ rewritten
@@ -1,6 +1,67 @@
-def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+    """ """
     batch_size, sequence_length, hidden_dim = hidden_states.shape
-    hidden_states_reshaped = hidden_states.view(-1, hidden_dim)
-    routing_weights, selected_experts = self.router(hidden_states_reshaped)
-    final_hidden_states = self.experts(hidden_states_reshaped, selected_experts, routing_weights)
-    return final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
+    hidden_states = hidden_states.view(-1, hidden_dim)
+    # router_logits: (batch * sequence_length, n_experts)
+    router_logits = self.gate(hidden_states)
+
+    routing_weights = torch.nn.functional.softmax(router_logits, dim=1, dtype=torch.float)
+    routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
+    if self.norm_topk_prob:  # only diff with mixtral sparse moe block!
+        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+    # we cast back to the input dtype
+    routing_weights = routing_weights.to(hidden_states.dtype)
+
+    final_hidden_states = torch.zeros(
+        (batch_size * sequence_length, hidden_dim),
+        dtype=hidden_states.dtype,
+        device=hidden_states.device,
+    )
+
+    # One hot encode the selected experts to create an expert mask
+    # this will be used to easily index which expert is going to be sollicitated
+    expert_mask = torch.nn.functional.one_hot(
+        selected_experts, num_classes=self.num_experts
+    ).permute(2, 1, 0)
+
+    # Loop over all available experts in the model
+    # and perform the computation on each expert
+    expert_sum = expert_mask.sum(dim=(-1, -2))
+    # expert_hit = torch.greater(expert_sum, 0).nonzero()
+    # for expert_idx in expert_hit:
+    for expert_idx in range(self.num_experts):
+        # initial code has a squeeze but it is not possible to do that.
+        # expert_mask_idx = expert_mask[expert_idx].squeeze(0)
+        expert_mask_idx = expert_mask[expert_idx]
+        final_hidden_states = torch.cond(
+            (expert_sum[expert_idx] > 0).item(),
+            lambda final_hidden_states, expert_mask, hidden_states, routing_weights, _i=expert_idx: self._forward_expert_loop(  # noqa: E501
+                final_hidden_states,
+                expert_mask,
+                hidden_states,
+                routing_weights,
+                expert_idx=_i,
+            ),
+            lambda final_hidden_states, *args: final_hidden_states.clone(),
+            [final_hidden_states, expert_mask_idx, hidden_states, routing_weights],
+        )
+
+        # if expert_sum[expert_idx] > 0:
+        #    idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
+
+        # Index the correct hidden states and compute the expert hidden state for
+        # the current expert. We need to make sure to multiply the output hidden
+        # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
+        #    current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
+        #    current_hidden_states = (
+        #        expert_layer(current_state) * routing_weights[top_x, idx, None]
+        #    )
+
+        # However `index_add_` only support torch tensors for indexing so we'll use
+        # the `top_x` tensor here.
+        #    final_hidden_states.index_add_(
+        #        0, top_x, current_hidden_states.to(hidden_states.dtype)
+        #    )
+
+    final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
+    return final_hidden_states, router_logits

auto/patch_transformers: ‘Qwen3MoeSparseMoeBlock_forward_expert_loop’ -> patched_Qwen3MoeSparseMoeBlock._forward_expert_loop¶

def _forward_expert_loop(
    self,
    final_hidden_states,
    expert_mask_idx,
    hidden_states,
    routing_weights,
    expert_idx: int,
):
    # idx, top_x = torch.where(expert_mask_idx.squeeze(0))
    idx, top_x = torch.nonzero(expert_mask_idx, as_tuple=True)
    hidden_dim = hidden_states.shape[-1]
    current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
    expert_current_state = self.experts[expert_idx](current_state)
    current_hidden_states = expert_current_state * routing_weights[top_x, idx, None]
    return final_hidden_states.index_add(0, top_x, current_hidden_states.to(hidden_states.dtype))

auto/patch_transformers: SamMaskDecoder.forward -> patched_SamMaskDecoder.forward¶

--- original
+++ rewritten
@@ -5,6 +5,7 @@
     sparse_prompt_embeddings: torch.Tensor,
     dense_prompt_embeddings: torch.Tensor,
     multimask_output: bool,
+    output_attentions: Optional[bool] = None,
     attention_similarity: Optional[torch.Tensor] = None,
     target_embedding: Optional[torch.Tensor] = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
@@ -22,19 +23,31 @@
             the embeddings of the mask inputs
         multimask_output (bool):
             Whether to return multiple masks or a single mask.
+        output_attentions (bool, *optional*):
+            Whether or not to return the attentions tensors of all attention layers.
     """
     batch_size, num_channels, height, width = image_embeddings.shape
-    point_batch_size = (
-        sparse_prompt_embeddings.shape[1] if sparse_prompt_embeddings is not None else 1
-    )
+    point_batch_size = sparse_prompt_embeddings.shape[1]
     # Concatenate output tokens
     output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0)
     output_tokens = output_tokens.repeat(batch_size, point_batch_size, 1, 1)

-    if sparse_prompt_embeddings is not None:
-        tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=2)
-    else:
-        tokens = output_tokens
+    # torch.cond rewrites the if-else logic to handle empty sparse_prompt_embeddings
+    # torch.any is needed to avoid data-dependent control flow
+    # with sparse_prompt_embeddings.sum().item() != 0
+    def sparse_prompt_embeddings_is_not_empty(output_tokens, sparse_prompt_embeddings):
+        return torch.cat((output_tokens, sparse_prompt_embeddings), dim=2)
+
+    def sparse_prompt_embeddings_is_empty(output_tokens, sparse_prompt_embeddings):
+        return output_tokens.clone()
+
+    tokens = torch.cond(
+        torch.any(sparse_prompt_embeddings != 0),
+        sparse_prompt_embeddings_is_not_empty,
+        sparse_prompt_embeddings_is_empty,
+        [output_tokens, sparse_prompt_embeddings],
+    )
+
     point_embeddings = tokens.to(self.iou_token.weight.dtype)

     # Expand per-image data in batch direction to be per-point
@@ -45,15 +58,21 @@
     )

     # Run the transformer, image_positional_embedding are consumed
-    point_embedding, image_embeddings = self.transformer(
+    torch._check(point_embeddings.shape[0] != 0)
+    torch._check(point_embeddings.shape[1] != 0)
+    torch._check(point_embeddings.shape[2] != 0)
+    torch._check(point_embeddings.shape[3] != 0)
+    embeddings_attentions = self.transformer(
         point_embeddings=point_embeddings,
         image_embeddings=image_embeddings,
         image_positional_embeddings=image_positional_embeddings,
         attention_similarity=attention_similarity,
         target_embedding=target_embedding,
+        output_attentions=output_attentions,
     )
-    iou_token_out = point_embedding[:, :, 0, :]
-    mask_tokens_out = point_embedding[:, :, 1 : (1 + self.num_mask_tokens), :]
+    point_embedding, image_embeddings = embeddings_attentions[:2]
+    iou_token_out = torch.select(point_embedding, dim=2, index=0)
+    mask_tokens_out = torch.narrow(point_embedding, dim=2, start=1, length=self.num_mask_tokens)

     # Upscale mask embeddings and predict masks using the mask tokens
     image_embeddings = image_embeddings.transpose(2, 3).reshape(
@@ -88,4 +107,15 @@
         mask_slice = slice(0, 1)
     masks = masks[:, :, mask_slice, :, :]
     iou_pred = iou_pred[:, :, mask_slice]
-    return masks, iou_pred
+
+    outputs = (masks, iou_pred)
+
+    if len(embeddings_attentions) == 2:
+        # transformers==4.54
+        return outputs
+
+    if output_attentions and len(embeddings_attentions) > 2:
+        outputs = outputs + (embeddings_attentions[2],)  # noqa: RUF005
+    else:
+        outputs = outputs + (None,)  # noqa: RUF005
+    return outputs

auto/patch_transformers: SmolLM3RotaryEmbedding.forward -> common_RotaryEmbedding.forward¶

--- original
+++ rewritten
@@ -1,8 +1,16 @@
-@torch.no_grad()
-@dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
-def forward(self, x, position_ids):
+@patched_dynamic_rope_update
+def forward(self, x, position_ids, layer_type=None):
+    if layer_type is not None:
+        # transformers>=5.0
+        inv_freq = getattr(self, f"{layer_type}_inv_freq")
+        attention_scaling = getattr(self, f"{layer_type}_attention_scaling")
+    else:
+        # transformers<5.0
+        inv_freq = self.inv_freq
+        attention_scaling = self.attention_scaling
+
     inv_freq_expanded = (
-        self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
     )
     position_ids_expanded = position_ids[:, None, :].float()

@@ -12,7 +20,7 @@
     with torch.autocast(device_type=device_type, enabled=False):  # Force float32
         freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
         emb = torch.cat((freqs, freqs), dim=-1)
-        cos = emb.cos() * self.attention_scaling
-        sin = emb.sin() * self.attention_scaling
+        cos = emb.cos() * attention_scaling
+        sin = emb.sin() * attention_scaling

     return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)

--- original
+++ rewritten
@@ -1,99 +1,193 @@
-def dynamic_rope_update(rope_forward):
-    """
-    Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE
-    (i.e. a RoPE implementation that may recompute its frequencies in the forward pass).
+def patched_dynamic_rope_update(rope_forward):
+    """manual patch: ``[patch:transformers.modeling_rope_utils.dynamic_rope_update]``

-    Args:
-        rope_forward (Callable):
-            The forward pass of the RoPE implementation.
+    ``rope_type`` is determined in the constructor of class
+    :class:`transformers.models.phi3.modeling_phi3.Phi3RotaryEmbedding`.

-    Returns:
-        The decorated forward pass.
+    .. code-block:: python
+
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            self.rope_type = config.rope_scaling.get(
+                "rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+
+    The original code of the patched function:
+
+    .. code-block:: python
+
+        def dynamic_rope_update(rope_forward):
+            def longrope_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if hasattr(self.config, "original_max_position_embeddings"):
+                    original_max_position_embeddings =
+                        self.config.original_max_position_embeddings
+                else:
+                    original_max_position_embeddings =
+                        self.config.max_position_embeddings
+                if seq_len > original_max_position_embeddings:
+                    if not hasattr(self, "long_inv_freq"):
+                        self.long_inv_freq, _ = self.rope_init_fn(
+                            self.config, device, seq_len=original_max_position_embeddings + 1
+                        )
+                    self.register_buffer("inv_freq", self.long_inv_freq, persistent=False)
+                else:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+
+            def dynamic_frequency_update(self, position_ids, device):
+                seq_len = torch.max(position_ids) + 1
+                if seq_len > self.max_seq_len_cached:  # growth
+                    inv_freq, self.attention_scaling = self.rope_init_fn(
+                        self.config, device, seq_len=seq_len)
+                    self.register_buffer("inv_freq", inv_freq, persistent=False)
+                    self.max_seq_len_cached = seq_len
+
+                if seq_len < self.original_max_seq_len and
+                        self.max_seq_len_cached > self.original_max_seq_len:
+                    self.original_inv_freq = self.original_inv_freq.to(device)
+                    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+                    self.max_seq_len_cached = self.original_max_seq_len
+
+            @wraps(rope_forward)
+            def wrapper(self, x, position_ids):
+                if "dynamic" in self.rope_type:
+                    dynamic_frequency_update(self, position_ids, device=x.device)
+                elif self.rope_type == "longrope":
+                    longrope_frequency_update(self, position_ids, device=x.device)
+                return rope_forward(self, x, position_ids)
+
+            return wrapper
+
     """

     def longrope_frequency_update(self, position_ids, device, layer_type=None):
-        """Longrope uses long factor if sequence is larger than original pretraining length, short otherwise."""
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
         seq_len = torch.max(position_ids) + 1
-        original_max_position_embeddings = getattr(
-            self.config, "original_max_position_embeddings", self.config.max_position_embeddings
-        )
+        if hasattr(self.config, "original_max_position_embeddings"):
+            original_max_position_embeddings = self.config.original_max_position_embeddings
+        else:
+            original_max_position_embeddings = self.config.max_position_embeddings
+
         if layer_type is None:
-            rope_type = self.rope_type
+            # rope_type = self.rope_type
             original_inv_freq = self.original_inv_freq
             prefix = ""
         else:
-            rope_type = self.rope_type[layer_type]
+            # rope_type = self.rope_type[layer_type]
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"

-        if seq_len > original_max_position_embeddings:
-            if not hasattr(self, f"{layer_type}_long_inv_freq"):
-                rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
-                long_inv_freq, _ = rope_init_fn(
-                    self.config,
-                    device,
-                    seq_len=original_max_position_embeddings + 1,
-                    layer_type=layer_type,
-                )
-            self.register_buffer(f"{prefix}inv_freq", long_inv_freq, persistent=False)
-            setattr(self, f"{prefix}long_inv_freq", long_inv_freq)
-        else:
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            original_inv_freq = original_inv_freq.to(device)
-            self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False)
-            setattr(self, f"{prefix}original_inv_freq", original_inv_freq)
+        # At export time, seq_len is unknown.
+        long_inv_freq, _ = rope_init_fn(
+            self.config, device, seq_len=original_max_position_embeddings + 1
+        )
+        original_inv_freq = self.original_inv_freq.to(device)
+
+        # PATCHED: uses torch.cond instead of a test
+        cond = (seq_len > original_max_position_embeddings).item()
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        setattr(self, f"{prefix}inv_freq", inv_freq)
+        # if seq_len > original_max_position_embeddings:
+        #    self.inv_freq = self.long_inv_freq
+        # else:
+        #    self.inv_freq = self.original_inv_freq

     def dynamic_frequency_update(self, position_ids, device, layer_type=None):
-        """
-        dynamic RoPE layers should recompute `inv_freq` in the following situations:
-        1 - growing beyond the cached sequence length (allow scaling)
-        2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
-        """
+        # constructor:
+        # - self.max_seq_len_cached = config.max_position_embeddings
+        # - self.original_max_seq_len = config.max_position_embeddings
+        # - inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+
+        # It is no use to patch the function after the model is created
+        # as rope_init_fn is an attribute set to one function when the model
+        # is created and when no patch is applied yet.
+        # So we select the patched version here.
+        rope_init_fn = _get_rope_init_fn(self, layer_type=layer_type)
+
+        # This behaviour is difficult to translate.
+        # The sequence always grows.
+        # The test should always True.
+        # So:  self.max_seq_len_cached = max(self.max_seq_len_cached, seq_len) --> seq_len
+        #
+        # if seq_len > self.max_seq_len_cached:  # growth
+        #    inv_freq, self.attention_scaling = self.rope_init_fn(
+        #        self.config, device, seq_len=seq_len
+        #    )
+        #    self.register_buffer("inv_freq", inv_freq, persistent=False)
+        #    self.max_seq_len_cached = seq_len
+        #
+        # So we should not need what follows.
+        #
+        # cond = (seq_len > self.max_seq_len_cached).item()
+        # self.attention_scaling = torch.cond(
+        #    cond,
+        #    (lambda x, y: x.clone()),
+        #    (lambda x, y: y.clone()),
+        #    [attention_scaling, self.attention_scaling],
+        # )
+
         seq_len = torch.max(position_ids) + 1
+        long_inv_freq, self.attention_scaling = rope_init_fn(self.config, device, seq_len=seq_len)
+
         if layer_type is None:
-            rope_type = self.rope_type
-            max_seq_len_cached = self.max_seq_len_cached
+            # rope_type = self.rope_type
+            # max_seq_len_cached = self.max_seq_len_cached
             original_inv_freq = self.original_inv_freq
             prefix = ""
         else:
-            rope_type = self.rope_type[layer_type]
-            max_seq_len_cached = getattr(
-                self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
-            )
+            # rope_type = self.rope_type[layer_type]
+            # max_seq_len_cached = getattr(
+            #     self, f"{layer_type}_max_seq_len_cached", self.max_seq_len_cached
+            # )
             original_inv_freq = getattr(self, f"{layer_type}_original_inv_freq")
             prefix = f"{layer_type}_"

-        if seq_len > max_seq_len_cached:  # growth
-            rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
-            inv_freq, self.attention_scaling = rope_init_fn(
-                self.config,
-                device,
-                seq_len=seq_len,
-                layer_type=layer_type,
-            )
-            # TODO joao: may break with compilation
-            self.register_buffer(f"{prefix}inv_freq", inv_freq, persistent=False)
-            setattr(self, f"{layer_type}_max_seq_len_cached", seq_len)
+        # Second test to translate.
+        # Let's keep in mind, self.max_seq_len_cached = seq_len is likely to be True.
+        # But in that case the following condition is a way to restore the original cache.

-        if (
-            seq_len < self.original_max_seq_len and max_seq_len_cached > self.original_max_seq_len
-        ):  # reset
-            # This .to() is needed if the model has been moved to a device after being initialized (because
-            # the buffer is automatically moved, but not the original copy)
-            original_inv_freq = original_inv_freq.to(device)
-            self.register_buffer(f"{prefix}inv_freq", original_inv_freq, persistent=False)
-            setattr(self, f"{prefix}original_inv_freq", original_inv_freq)
-            setattr(self, f"{layer_type}_max_seq_len_cached", self.original_max_seq_len)
+        # if (
+        #    seq_len < self.original_max_seq_len
+        #    and self.max_seq_len_cached > self.original_max_seq_len
+        # ):
+        #    self.original_inv_freq = self.original_inv_freq.to(device)
+        #    self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
+        #    self.max_seq_len_cached = self.original_max_seq_len
+
+        original_inv_freq = self.original_inv_freq.to(device)
+        cond = (seq_len >= self.original_max_seq_len).item()
+        # PATCHED: uses torch.cond instead of a test
+        inv_freq = torch.cond(
+            cond,
+            (lambda x, y: x.clone()),
+            (lambda x, y: y.clone()),
+            [long_inv_freq, original_inv_freq],
+        )
+        setattr(self, f"{prefix}inv_freq", inv_freq)

     @wraps(rope_forward)
     def wrapper(self, x, position_ids, layer_type=None):
-        rope_type = self.rope_type if layer_type is None else self.rope_type[layer_type]
-        kwargs = {"layer_type": layer_type} if layer_type is not None else {}
-        if "dynamic" in rope_type:
-            dynamic_frequency_update(self, position_ids, device=x.device, **kwargs)
-        elif rope_type == "longrope":
-            longrope_frequency_update(self, position_ids, device=x.device, **kwargs)
-        return rope_forward(self, x, position_ids, **kwargs)
+        if layer_type is None:
+            if "dynamic" in self.rope_type:
+                dynamic_frequency_update(self, position_ids, device=x.device)
+            elif self.rope_type == "longrope":
+                longrope_frequency_update(self, position_ids, device=x.device)
+            return rope_forward(self, x, position_ids)
+
+        if "dynamic" in self.rope_type:
+            dynamic_frequency_update(self, position_ids, device=x.device, layer_type=layer_type)
+        elif self.rope_type == "longrope":
+            longrope_frequency_update(self, position_ids, device=x.device, layer_type=layer_type)
+        return rope_forward(self, x, position_ids, layer_type=layer_type)

     return wrapper

auto/patch_transformers: VisionAttention.forward -> patched_VisionAttention.forward¶

--- original
+++ rewritten
@@ -3,69 +3,55 @@
     hidden_states: torch.Tensor,
     cu_seqlens: torch.Tensor,
     rotary_pos_emb: Optional[torch.Tensor] = None,
-    position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
-    **kwargs,
+    position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
 ) -> torch.Tensor:
     seq_length = hidden_states.shape[0]
-    query_states, key_states, value_states = (
+    q, k, v = (
         self.qkv(hidden_states)
         .reshape(seq_length, 3, self.num_heads, -1)
         .permute(1, 0, 2, 3)
         .unbind(0)
     )
-    cos, sin = position_embeddings
-    query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin)
+    if position_embeddings is None:
+        transformers.models.qwen2_vl.modeling_qwen2_vl.logger.warning_once(
+            "The attention layers in this model are transitioning from "
+            " computing the RoPE embeddings internally "
+            "through `rotary_pos_emb` (2D tensor of RoPE theta values), "
+            "to using externally computed "
+            "`position_embeddings` (Tuple of tensors, containing cos and sin)."
+            " In v4.54 `rotary_pos_emb` will be "
+            "removed and `position_embeddings` will be mandatory."
+        )
+        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+        cos = emb.cos()
+        sin = emb.sin()
+    else:
+        cos, sin = position_embeddings
+    q, k = transformers.models.qwen2_vl.modeling_qwen2_vl.apply_rotary_pos_emb_vision(
+        q, k, cos, sin
+    )

-    query_states = query_states.transpose(0, 1).unsqueeze(0)
-    key_states = key_states.transpose(0, 1).unsqueeze(0)
-    value_states = value_states.transpose(0, 1).unsqueeze(0)
+    attention_mask = torch.full(
+        [1, seq_length, seq_length],
+        torch.finfo(q.dtype).min,
+        device=q.device,
+        dtype=q.dtype,
+    )
+    # for i in range(1, len(cu_seqlens)):
+    #     attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i],
+    #                         cu_seqlens[i - 1] : cu_seqlens[i]] = 0
+    attention_mask = rewrite_loop_for_square_mask(attention_mask, cu_seqlens)

-    attention_interface: Callable = eager_attention_forward
-    if self.config._attn_implementation != "eager":
-        attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
-
-    if self.config._attn_implementation == "flash_attention_2":
-        # Flash Attention 2: Use cu_seqlens for variable length attention
-        max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
-        attn_output, _ = attention_interface(
-            self,
-            query_states,
-            key_states,
-            value_states,
-            attention_mask=None,
-            scaling=self.scaling,
-            dropout=0.0 if not self.training else self.attention_dropout,
-            cu_seq_lens_q=cu_seqlens,
-            cu_seq_lens_k=cu_seqlens,
-            max_length_q=max_seqlen,
-            max_length_k=max_seqlen,
-            is_causal=False,
-            **kwargs,
-        )
-    else:
-        # Other implementations: Process each chunk separately
-        lengths = cu_seqlens[1:] - cu_seqlens[:-1]
-        splits = [
-            torch.split(tensor, lengths.tolist(), dim=2)
-            for tensor in (query_states, key_states, value_states)
-        ]
-
-        attn_outputs = [
-            attention_interface(
-                self,
-                q,
-                k,
-                v,
-                attention_mask=None,
-                scaling=self.scaling,
-                dropout=0.0 if not self.training else self.attention_dropout,
-                is_causal=False,
-                **kwargs,
-            )[0]
-            for q, k, v in zip(*splits)
-        ]
-        attn_output = torch.cat(attn_outputs, dim=1)
-
-    attn_output = attn_output.reshape(seq_length, -1).contiguous()
+    q = q.transpose(0, 1)
+    k = k.transpose(0, 1)
+    v = v.transpose(0, 1)
+    attn_weights = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
+    attn_weights = attn_weights + attention_mask
+    attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(
+        q.dtype
+    )
+    attn_output = torch.matmul(attn_weights, v)
+    attn_output = attn_output.transpose(0, 1)
+    attn_output = attn_output.reshape(seq_length, -1)
     attn_output = self.proj(attn_output)
     return attn_output

auto/patch_transformers: eager_attention_forward -> patched_model_bart_eager_attention_forward¶

--- original
+++ rewritten
@@ -1,27 +1,23 @@
-def eager_attention_forward(
-    module: nn.Module,
+def patched_model_bart_eager_attention_forward(
+    module: torch.nn.Module,
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
     attention_mask: Optional[torch.Tensor],
     scaling: Optional[float] = None,
     dropout: float = 0.0,
-    **kwargs: Unpack[TransformersKwargs],
+    head_mask: Optional[torch.Tensor] = None,
+    **kwargs,
 ):
-    if scaling is None:
-        scaling = query.size(-1) ** -0.5
-
-    # Take the dot product between "query" and "key" to get the raw attention scores.
-    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
-
-    if attention_mask is not None:
-        attention_mask = attention_mask[:, :, :, : key.shape[-2]]
-        attn_weights = attn_weights + attention_mask
-
-    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
-    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
-
-    attn_output = torch.matmul(attn_weights, value)
-    attn_output = attn_output.transpose(1, 2).contiguous()
-
-    return attn_output, attn_weights
+    """[patch:transformers.models.bart.modeling_bart.eager_attention_forward]"""
+    return common_eager_attention_forward(
+        module,
+        query,
+        key,
+        value,
+        attention_mask=attention_mask,
+        scaling=scaling,
+        dropout=dropout,
+        head_mask=head_mask,
+        **kwargs,
+    )

auto/patch_transformers: eager_attention_forward -> patched_modeling_marian_eager_attention_forward¶

--- original
+++ rewritten
@@ -1,27 +1,23 @@
-def eager_attention_forward(
-    module: nn.Module,
+def patched_modeling_marian_eager_attention_forward(
+    module: torch.nn.Module,
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
     attention_mask: Optional[torch.Tensor],
     scaling: Optional[float] = None,
     dropout: float = 0.0,
-    **kwargs: Unpack[TransformersKwargs],
+    head_mask: Optional[torch.Tensor] = None,
+    **kwargs,
 ):
-    if scaling is None:
-        scaling = query.size(-1) ** -0.5
-
-    # Take the dot product between "query" and "key" to get the raw attention scores.
-    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
-
-    if attention_mask is not None:
-        attention_mask = attention_mask[:, :, :, : key.shape[-2]]
-        attn_weights = attn_weights + attention_mask
-
-    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
-    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
-
-    attn_output = torch.matmul(attn_weights, value)
-    attn_output = attn_output.transpose(1, 2).contiguous()
-
-    return attn_output, attn_weights
+    """[patch:transformers.models.marian.modeling_marian.eager_attention_forward]"""
+    return common_eager_attention_forward(
+        module,
+        query,
+        key,
+        value,
+        attention_mask=attention_mask,
+        scaling=scaling,
+        dropout=dropout,
+        head_mask=head_mask,
+        **kwargs,
+    )