Source code for maxtext.experimental.agent.ckpt_conversion_agent.ground_truth.qwen3

# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Qwen3 ckpt conversation agent ground truth hook functions."""

import numpy as np



[docs]
def QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN(config, scan_layers=False, saving_to_hf=False):
  """Creates parameter transformation functions for Qwen3.

  This function provides a dictionary of transformation functions (hooks) for
  converting Qwen3 model parameters between MaxText and Hugging Face formats.
  It handles embedding padding and kernel reshaping.

  Args:
    config (dict): Model configuration dictionary, including
      'num_hidden_layers' and optionally 'num_experts'.
    scan_layers (bool, optional): Whether the model uses scanned layers.
      Defaults to False.
    saving_to_hf (bool, optional): The direction of conversion. True for
      MaxText to Hugging Face, False for the reverse. Defaults to False.

  Returns:
    dict: A dictionary mapping MaxText parameter names to their corresponding
      transformation functions.
  """
  n_layers = config["num_hidden_layers"]
  num_experts = config.get("num_experts", 0)

  def pad_embedding_layer(input_tensor, target_shape):
    """Pads or truncates embedding layer to match target vocab size."""
    source_vocab_size = input_tensor.shape[0]
    target_vocab_size = target_shape[0]

    if source_vocab_size == target_vocab_size:
      return input_tensor

    if saving_to_hf:  # MaxText to HF, truncate
      return input_tensor[:target_vocab_size, :]
    else:  # HF to MaxText, pad
      padded_tensor = np.zeros(target_shape, dtype=input_tensor.dtype)
      padded_tensor[:source_vocab_size, :] = input_tensor
      return padded_tensor

  def reshape_kernel(input_tensor, target_shape):
    """Reshapes and transposes kernel weights between MaxText and HF."""
    if saving_to_hf:
      flipped_target_shape = np.flip(np.array(target_shape))
      return input_tensor.reshape(flipped_target_shape).T
    else:
      return input_tensor.T.reshape(target_shape)

  mapping = {
      "params-token_embedder-embedding": pad_embedding_layer,
      "params-decoder-logits_dense-kernel": reshape_kernel,
  }

  kernel_hooks = [
      "self_attention-query-kernel",
      "self_attention-key-kernel",
      "self_attention-value-kernel",
      "self_attention-out-kernel",
      "mlp-wi_0-kernel",
      "mlp-wi_1-kernel",
      "mlp-wo-kernel",
  ]
  moe_kernel_hooks = [
      "moe_block-gate-kernel",
      "moe_block-wi_0-kernel",
      "moe_block-wi_1-kernel",
      "moe_block-wo-kernel",
  ]

  if scan_layers:
    for key in kernel_hooks:
      mapping[f"params-decoder-layers-{key}"] = reshape_kernel
    if num_experts > 1:
      for key in moe_kernel_hooks:
        mapping[f"params-decoder-layers-{key}"] = reshape_kernel
  else:
    for i in range(n_layers):
      for key in kernel_hooks:
        mapping[f"params-decoder-layers_{i}-{key}"] = reshape_kernel
      if num_experts > 1:
        for key in moe_kernel_hooks:
          mapping[f"params-decoder-layers_{i}-{key}"] = reshape_kernel
  return mapping