# maxtext.configs package
## Submodules
- maxtext.configs.pyconfig module
- maxtext.configs.pyconfig_deprecated module
  - yaml_key_to_env_key()
  - string_to_bool()
  - validate_compute_axis_order()
  - validate_shard_mode()
  - validate_kv_quant_axis()
  - validate_attention_kernel()
  - validate_attention_type()
  - validate_moba_attention()
  - validate_attention_window_params()
  - validate_profiler_type()
  - validate_periodic_profiler()
  - validate_model_call_mode()
  - validate_prefill_and_target_lengths()
  - validate_rope_type()
  - validate_expert_shard_attention_option()
  - validate_vocab_tiling()
  - validate_rampup_batch_size()
  - validate_context_parallel_strategy_ring()
  - validate_keys()
  - validate_tokenizer()
  - validate_constant_bound()
  - validate_quantization_methods()
  - validate_tokamax_usage()
  - validate_data_input()
  - validate_llama4_config()
  - validate_model_name()
  - validate_multimodal_model_name()
  - validate_no_keys_overwritten_twice()
  - validate_and_assign_remat_tensors()
  - resolve_config_path()
  - create_parallelisms_list()
  - set_mu_dtype()
  - validate_and_set_hlo_dump_defaults()
  - validate_multiple_slices()
  - set_and_validate_pipeline_config()
  - validate_deepseek_moe()
  - validate_mlp_dim()
  - validate_gpt_oss_moe()
  - validate_sparse_matmul_parallelism()
  - validate_ring_of_experts_parallelism()
  - validate_shard_expert_on_fsdp()
  - validate_ragged_dot()
  - validate_optimizer_sharding_over_data()
  - create_new_logical_axis_rules()
  - update_model_keys()
  - validate_and_update_keys()
  - get_individual_scales()
  - calculate_global_batch_sizes()
  - calculate_rampup_samples_and_steps()
  - get_num_target_devices()
  - get_quantization_local_shard_count()
  - get_context_parallel_size()
  - using_pipeline_parallelism()
  - using_tensor_parallelism()
  - using_sequence_parallelism()
  - using_expert_parallelism()
  - using_fsdp_and_transpose_parallelism()
  - HyperParameters
  - initialize()
- maxtext.configs.types module
  - XProfTPUPowerTraceMode
  - DType
  - MatmulPrecision
  - QuantizationType
    - QuantizationType.NONE
    - QuantizationType.INT4
    - QuantizationType.INT8
    - QuantizationType.INTMP
    - QuantizationType.FP8
    - QuantizationType.NANOO_FP8
    - QuantizationType.FP8_NANO_V2
    - QuantizationType.FP8_GPU
    - QuantizationType.FP8_FULL
    - QuantizationType.TE_FP8_DS
    - QuantizationType.TE_FP8_CS
    - QuantizationType.TE_MXFP8
    - QuantizationType.TE_NVFP4
    - QuantizationType.TE_NVFP4_NO_RHT
KvQuantAxisRematPolicyRematLocationOptimizerTypeLearningRateScheduleTypeWsdDecayStyleRopeTypeTokenizerTypeDatasetTypeSamplingStrategyProfilerTypeRunInfoCheckpointingCheckpointing.load_parameters_pathCheckpointing.lora_input_adapters_pathCheckpointing.load_full_state_pathCheckpointing.enable_checkpointingCheckpointing.load_checkpoint_only_onceCheckpointing.async_checkpointingCheckpointing.checkpoint_periodCheckpointing.max_num_checkpoints_to_keepCheckpointing.enable_single_replica_ckpt_restoringCheckpointing.checkpoint_todelete_subdirCheckpointing.checkpoint_todelete_full_pathCheckpointing.force_unrollCheckpointing.checkpoint_is_quantizedCheckpointing.save_quantized_params_pathCheckpointing.enable_orbax_v1Checkpointing.checkpoint_conversion_fnCheckpointing.source_checkpoint_layoutCheckpointing.save_checkpoint_on_completionCheckpointing.enable_continuous_checkpointingCheckpointing.colocated_python_checkpointingCheckpointing.enable_autocheckpointCheckpointing.model_config
OrbaxStorageEmergencyCheckpointingEmergencyCheckpointing.enable_multi_tier_checkpointingEmergencyCheckpointing.local_checkpoint_directoryEmergencyCheckpointing.local_checkpoint_periodEmergencyCheckpointing.multi_tier_checkpointing_backup_interval_minutesEmergencyCheckpointing.mtc_data_parallelismEmergencyCheckpointing.enable_emergency_checkpointEmergencyCheckpointing.use_replicator_serviceEmergencyCheckpointing.replicator_backup_interval_minutesEmergencyCheckpointing.model_config
DataTypesQuantizationQuantization.quantizationQuantization.replicate_quant_scaleQuantization.quant_cfg_pathQuantization.quantize_kvcacheQuantization.kv_quant_axisQuantization.kv_quant_dtypeQuantization.quantization_local_shard_countQuantization.use_qwix_quantizationQuantization.use_manual_quantizationQuantization.weight_quantization_calibration_methodQuantization.act_quantization_calibration_methodQuantization.bwd_quantization_calibration_methodQuantization.weight_sparsity_nQuantization.weight_sparsity_mQuantization.weight_sparsity_update_stepQuantization.weight_sparsity_start_stepQuantization.model_config
ModelArchitectureModelArchitecture.decoder_blockModelArchitecture.global_parameter_scaleModelArchitecture.base_emb_dimModelArchitecture.base_num_query_headsModelArchitecture.base_num_kv_headsModelArchitecture.base_mlp_dimModelArchitecture.dense_init_scaleModelArchitecture.base_num_decoder_layersModelArchitecture.head_dimModelArchitecture.attention_output_dimModelArchitecture.global_head_dimModelArchitecture.mlp_activationsModelArchitecture.mlp_activations_limitModelArchitecture.normalization_layer_epsilonModelArchitecture.fused_qkvModelArchitecture.attention_biasModelArchitecture.fused_mlpModelArchitecture.qk_norm_with_scaleModelArchitecture.v_norm_with_scaleModelArchitecture.model_config
MTPLogitsAttentionAttention.attentionAttention.attention_typeAttention.share_kv_projectionsAttention.global_num_kv_headsAttention.attention_sinkAttention.float32_qk_productAttention.float32_logitsAttention.sliding_window_sizeAttention.chunk_attn_window_sizeAttention.attn_logits_soft_capAttention.use_post_attn_normAttention.use_post_ffw_normAttention.use_ragged_attentionAttention.use_tokamax_gmmAttention.ragged_block_sizeAttention.enable_padding_causal_maskAttention.use_tokamax_splashAttention.use_jax_splashAttention.force_q_layoutAttention.use_qk_clipAttention.qk_clip_thresholdAttention.model_config
MoBaMlaAttentionAttentionIndexerLlama4AttentionSplashAttentionSplashAttention.sa_block_qSplashAttention.sa_block_kvSplashAttention.sa_block_kv_computeSplashAttention.sa_block_q_dkvSplashAttention.sa_block_kv_dkvSplashAttention.sa_block_kv_dkv_computeSplashAttention.sa_block_q_dqSplashAttention.sa_block_kv_dqSplashAttention.sa_use_fused_bwd_kernelSplashAttention.sa_q_layoutSplashAttention.sa_k_layoutSplashAttention.sa_v_layoutSplashAttention.use_max_logit_estimateSplashAttention.cost_estimate_flops_fwdSplashAttention.cost_estimate_flops_bwdSplashAttention.dq_reduction_stepsSplashAttention.use_splash_schedulerSplashAttention.model_config
PagedAttentionMoEGeneralMoEGeneral.num_expertsMoEGeneral.num_experts_per_tokMoEGeneral.capacity_factorMoEGeneral.ragged_buffer_factorMoEGeneral.moe_expert_input_dimMoEGeneral.base_moe_mlp_dimMoEGeneral.padded_base_moe_mlp_dimMoEGeneral.load_balance_loss_weightMoEGeneral.use_custom_sort_vjpMoEGeneral.use_ring_of_expertsMoEGeneral.use_gather_mosaic_kernelMoEGeneral.use_random_routingMoEGeneral.interleave_moe_layer_stepMoEGeneral.moe_fsdp_use_two_stage_all_gatherMoEGeneral.shard_exp_on_fsdpMoEGeneral.use_2d_fsdp_shardingMoEGeneral.norm_topk_probMoEGeneral.float32_weight_sumMoEGeneral.float32_gate_logitsMoEGeneral.prefuse_moe_weightsMoEGeneral.model_config
MoEKernelsMoEKernels.megabloxMoEKernels.sparse_matmulMoEKernels.wi_tile_fwd_batch_seqMoEKernels.wi_tile_fwd_embed_dimMoEKernels.wi_tile_fwd_mlp_dimMoEKernels.wi_tile_dlhs_batch_seqMoEKernels.wi_tile_dlhs_embed_dimMoEKernels.wi_tile_dlhs_mlp_dimMoEKernels.wi_tile_drhs_batch_seqMoEKernels.wi_tile_drhs_embed_dimMoEKernels.wi_tile_drhs_mlp_dimMoEKernels.wo_tile_fwd_batch_seqMoEKernels.wo_tile_fwd_embed_dimMoEKernels.wo_tile_fwd_mlp_dimMoEKernels.wo_tile_dlhs_batch_seqMoEKernels.wo_tile_dlhs_embed_dimMoEKernels.wo_tile_dlhs_mlp_dimMoEKernels.wo_tile_drhs_batch_seqMoEKernels.wo_tile_drhs_embed_dimMoEKernels.wo_tile_drhs_mlp_dimMoEKernels.merge_gating_gmmMoEKernels.model_config
DeepSeekMoEDeepSeekMoE.first_num_dense_layersDeepSeekMoE.shared_expertsDeepSeekMoE.routed_scaling_factorDeepSeekMoE.routed_score_funcDeepSeekMoE.routed_biasDeepSeekMoE.routed_bias_update_rateDeepSeekMoE.mlp_biasDeepSeekMoE.n_routing_groupsDeepSeekMoE.topk_routing_groupDeepSeekMoE.use_batch_split_scheduleDeepSeekMoE.batch_split_factorDeepSeekMoE.model_config
Qwen3NextHardwareAndMeshHardwareAndMesh.hardwareHardwareAndMesh.num_slicesHardwareAndMesh.mesh_axesHardwareAndMesh.shard_modeHardwareAndMesh.inhomogeneous_layer_cycle_intervalHardwareAndMesh.scan_layersHardwareAndMesh.param_scan_axisHardwareAndMesh.context_parallel_load_balanceHardwareAndMesh.context_parallel_strategyHardwareAndMesh.context_parallel_reorder_strategyHardwareAndMesh.custom_meshHardwareAndMesh.custom_mesh_and_ruleHardwareAndMesh.allow_split_physical_axesHardwareAndMesh.enable_nnxHardwareAndMesh.optimize_mesh_for_tpu_v6eHardwareAndMesh.shardyHardwareAndMesh.pure_nnx_decoderHardwareAndMesh.pure_nnxHardwareAndMesh.remove_size_one_mesh_axis_from_typeHardwareAndMesh.model_config
LayoutAndShardingLayoutAndSharding.logical_axis_rulesLayoutAndSharding.data_shardingLayoutAndSharding.context_shardingLayoutAndSharding.input_data_sharding_logical_axesLayoutAndSharding.sharding_toleranceLayoutAndSharding.shard_optimizer_over_dataLayoutAndSharding.internal_compileLayoutAndSharding.internal_compile_num_devicesLayoutAndSharding.compile_xla_flagsLayoutAndSharding.model_config
DcnParallelismDcnParallelism.dcn_diloco_parallelismDcnParallelism.dcn_data_parallelismDcnParallelism.dcn_fsdp_parallelismDcnParallelism.dcn_fsdp_transpose_parallelismDcnParallelism.dcn_sequence_parallelismDcnParallelism.dcn_context_parallelismDcnParallelism.dcn_context_autoregressive_parallelismDcnParallelism.dcn_tensor_parallelismDcnParallelism.dcn_tensor_transpose_parallelismDcnParallelism.dcn_tensor_sequence_parallelismDcnParallelism.dcn_pipeline_parallelismDcnParallelism.dcn_expert_parallelismDcnParallelism.dcn_autoregressive_parallelismDcnParallelism.model_config
IciParallelismIciParallelism.ici_diloco_parallelismIciParallelism.ici_data_parallelismIciParallelism.ici_fsdp_parallelismIciParallelism.ici_fsdp_transpose_parallelismIciParallelism.ici_sequence_parallelismIciParallelism.ici_context_parallelismIciParallelism.ici_context_autoregressive_parallelismIciParallelism.ici_tensor_parallelismIciParallelism.ici_tensor_transpose_parallelismIciParallelism.ici_tensor_sequence_parallelismIciParallelism.ici_autoregressive_parallelismIciParallelism.ici_pipeline_parallelismIciParallelism.ici_expert_parallelismIciParallelism.model_config
PipelineParallelismPipelineParallelism.pipeline_fsdp_ag_per_repeatPipelineParallelism.num_layers_per_pipeline_stagePipelineParallelism.num_pipeline_repeatsPipelineParallelism.pipeline_parallel_layersPipelineParallelism.num_pipeline_microbatchesPipelineParallelism.pipeline_delay_activation_forwardingPipelineParallelism.pipeline_fsdp_ag_oncePipelineParallelism.scan_pipeline_iterationsPipelineParallelism.scan_pipeline_repeatsPipelineParallelism.scan_layers_per_stagePipelineParallelism.set_remat_policy_on_pipeline_iterationsPipelineParallelism.set_remat_policy_on_layers_per_stagePipelineParallelism.model_config
RematAndOffloadRematAndOffload.remat_policyRematAndOffload.remat_policy_for_vitRematAndOffload.decoder_layer_inputRematAndOffload.contextRematAndOffload.mlpwiRematAndOffload.mlpwi_0RematAndOffload.mlpwi_1RematAndOffload.mlpwoRematAndOffload.moe_mlpwi_0RematAndOffload.moe_mlpwi_1RematAndOffload.moe_mlpwoRematAndOffload.query_projRematAndOffload.key_projRematAndOffload.value_projRematAndOffload.query_wa_projRematAndOffload.kv_wa_projRematAndOffload.qkv_projRematAndOffload.out_projRematAndOffload.mla_qRematAndOffload.mla_kvRematAndOffload.attention_outRematAndOffload.engramRematAndOffload.optimizer_memory_host_offloadRematAndOffload.parameter_memory_host_offloadRematAndOffload.model_config
TokenizerTokenizer.vocab_sizeTokenizer.tokenizer_pathTokenizer.tokenizer_typeTokenizer.use_chat_templateTokenizer.chat_template_pathTokenizer.chat_templateTokenizer.tokenize_train_dataTokenizer.tokenize_eval_dataTokenizer.add_bosTokenizer.add_eosTokenizer.use_truncationTokenizer.num_vocab_tilingTokenizer.model_config
DatasetGeneralDatasetGeneral.dataset_typeDatasetGeneral.per_device_batch_sizeDatasetGeneral.eval_per_device_batch_sizeDatasetGeneral.max_corpus_charsDatasetGeneral.train_data_columnsDatasetGeneral.train_image_columnDatasetGeneral.eval_data_columnsDatasetGeneral.eval_image_columnDatasetGeneral.packingDatasetGeneral.grain_packing_typeDatasetGeneral.max_segments_per_seqDatasetGeneral.num_epochDatasetGeneral.expansion_factor_real_dataDatasetGeneral.reuse_example_batchDatasetGeneral.generate_padding_batch_trainDatasetGeneral.generate_padding_batch_evalDatasetGeneral.enable_rampup_batch_sizeDatasetGeneral.per_device_batch_size_startDatasetGeneral.per_device_batch_size_incrementDatasetGeneral.global_rampup_samplesDatasetGeneral.colocated_python_data_inputDatasetGeneral.model_config
TfdsDatasetHfDatasetGrainDatasetGrainDataset.grain_train_filesGrainDataset.grain_eval_filesGrainDataset.grain_train_mixture_config_pathGrainDataset.grain_file_typeGrainDataset.grain_use_elastic_iteratorGrainDataset.grain_worker_countGrainDataset.grain_per_worker_buffer_sizeGrainDataset.grain_worker_count_evalGrainDataset.grain_per_worker_buffer_size_evalGrainDataset.grain_ram_budget_mbGrainDataset.grain_num_threadsGrainDataset.grain_prefetch_buffer_sizeGrainDataset.grain_num_threads_evalGrainDataset.grain_prefetch_buffer_size_evalGrainDataset.grain_data_source_max_workersGrainDataset.grain_shuffle_buffer_sizeGrainDataset.model_config
OlmoGrainDatasetFineTuningDistillationDistillation.student_overridesDistillation.teacher_overridesDistillation.offline_data_dirDistillation.distill_alphaDistillation.distill_temperatureDistillation.distill_betaDistillation.distill_feature_loss_typeDistillation.distill_layer_indicesDistillation.distill_alpha_endDistillation.distill_alpha_scheduleDistillation.distill_temperature_endDistillation.distill_temperature_scheduleDistillation.distill_beta_endDistillation.distill_beta_scheduleDistillation.learn_to_init_modeDistillation.lti_use_general_linear_mapDistillation.distill_weights_copy_mapDistillation.distill_student_weights_share_mapDistillation.student_params_to_updateDistillation.model_config
TrainingLoopTrainingLoop.stepsTrainingLoop.log_periodTrainingLoop.eval_intervalTrainingLoop.eval_stepsTrainingLoop.target_eval_lossTrainingLoop.abort_on_nan_lossTrainingLoop.abort_on_inf_lossTrainingLoop.enable_dropoutTrainingLoop.dropout_rateTrainingLoop.enable_data_shufflingTrainingLoop.data_shuffle_seedTrainingLoop.init_weights_seedTrainingLoop.model_config
ManifoldConstrainedHyperConnectionsDilocoParamsOptimizerOptimizer.opt_typeOptimizer.skip_step_on_spikesOptimizer.skip_step_intervalOptimizer.skip_step_scaling_factorOptimizer.gradient_accumulation_stepsOptimizer.use_tunix_gradient_accumulationOptimizer.gradient_clipping_thresholdOptimizer.learning_rateOptimizer.lr_schedule_typeOptimizer.learning_rate_final_fractionOptimizer.wsd_decay_steps_fractionOptimizer.wsd_decay_styleOptimizer.warmup_steps_fractionOptimizer.learning_rate_schedule_stepsOptimizer.trainable_parameters_maskOptimizer.model_config
AdamWMuonPositionalEmbeddingRopeYarnRopeInferenceGeneralInferenceGeneral.max_target_lengthInferenceGeneral.max_prefill_predict_lengthInferenceGeneral.promptInferenceGeneral.load_from_prefill_dirInferenceGeneral.prefill_cache_dirInferenceGeneral.autoregressive_decode_assertInferenceGeneral.model_call_modeInferenceGeneral.use_chunked_prefillInferenceGeneral.prefill_chunk_sizeInferenceGeneral.enable_model_warmupInferenceGeneral.enable_llm_inference_poolInferenceGeneral.multi_samplingInferenceGeneral.return_log_probInferenceGeneral.model_config
DecodingInferenceLayoutInferenceServerInferenceBenchmarkInferenceBenchmark.inference_microbenchmark_prefill_lengthsInferenceBenchmark.inference_microbenchmark_stagesInferenceBenchmark.inference_microbenchmark_loop_itersInferenceBenchmark.inference_microbenchmark_log_file_pathInferenceBenchmark.inference_microbenchmark_num_samplesInferenceBenchmark.inference_metadata_fileInferenceBenchmark.inference_benchmark_testInferenceBenchmark.model_config
PrefixCachingAOTDevelopmentAndDebuggingDevelopmentAndDebugging.constant_bound_configDevelopmentAndDebugging.jax_cache_dirDevelopmentAndDebugging.jax_distributed_initialization_timeoutDevelopmentAndDebugging.jax_debug_log_modulesDevelopmentAndDebugging.skip_jax_distributed_systemDevelopmentAndDebugging.enable_single_controllerDevelopmentAndDebugging.subslice_shapeDevelopmentAndDebugging.max_checkifyDevelopmentAndDebugging.model_config
ProfilingProfiling.profilerProfiling.upload_all_profiler_resultsProfiling.skip_first_n_steps_for_profilerProfiling.profiler_stepsProfiling.profile_cleanlyProfiling.profile_periodically_periodProfiling.hide_profiler_step_metricProfiling.enable_jax_profilerProfiling.jax_profiler_portProfiling.enable_tpu_profiling_optionsProfiling.tpu_num_chips_to_profile_per_taskProfiling.tpu_num_sparse_cores_to_traceProfiling.tpu_num_sparse_core_tiles_to_traceProfiling.xprof_tpu_power_trace_levelProfiling.xprof_e2e_enable_fw_throttle_eventProfiling.xprof_e2e_enable_fw_power_level_eventProfiling.xprof_e2e_enable_fw_thermal_eventProfiling.profile_power_eventsProfiling.model_config
HloDumpHloDump.dump_hloHloDump.dump_stepHloDump.dump_hlo_local_dirHloDump.dump_hlo_delete_local_afterHloDump.dump_hlo_gcs_dirHloDump.dump_hlo_module_nameHloDump.dump_hlo_local_module_nameHloDump.dump_hlo_xla_flagsHloDump.dump_hlo_upload_allHloDump.dump_jaxprHloDump.dump_jaxpr_local_dirHloDump.dump_jaxpr_delete_local_afterHloDump.dump_jaxpr_gcs_dirHloDump.model_config
StackTraceMetricsManagedMLDiagnosticsGoodputGoodput.enable_goodput_recordingGoodput.monitor_goodputGoodput.goodput_upload_interval_secondsGoodput.enable_pathways_goodputGoodput.monitor_step_time_deviationGoodput.step_deviation_interval_secondsGoodput.enable_gcp_goodput_metricsGoodput.enable_gcp_step_deviation_metricsGoodput.model_config
ElasticTrainingGcpMonitoringTensorboardMultimodalGeneralMultimodalGeneral.use_multimodalMultimodalGeneral.freeze_vision_encoder_paramsMultimodalGeneral.freeze_audio_encoder_paramsMultimodalGeneral.use_audioMultimodalGeneral.image_size_for_vitMultimodalGeneral.image_pathMultimodalGeneral.image_placeholderMultimodalGeneral.posemb_type_for_vitMultimodalGeneral.max_num_images_per_exampleMultimodalGeneral.video_pathMultimodalGeneral.audio_pathMultimodalGeneral.video_placeholderMultimodalGeneral.audio_placeholderMultimodalGeneral.use_audio_in_videoMultimodalGeneral.use_mropeMultimodalGeneral.mrope_sectionMultimodalGeneral.position_id_per_secondsMultimodalGeneral.model_config
VisionTowerVisionTower.hidden_size_for_vitVisionTower.intermediate_size_for_vitVisionTower.num_attention_heads_for_vitVisionTower.num_channels_for_vitVisionTower.tile_size_for_vitVisionTower.patch_size_for_vitVisionTower.conv_stride_for_vitVisionTower.num_hidden_layers_for_vitVisionTower.rope_theta_for_vitVisionTower.vision_output_dim_for_vitVisionTower.spatial_merge_size_for_vitVisionTower.out_hidden_size_for_vitVisionTower.temporal_patch_size_for_vitVisionTower.num_position_embeddings_for_vitVisionTower.deepstack_visual_indexes_for_vitVisionTower.vision_output_lengthVisionTower.model_config
VisionProjectorAudioEncoderAudioEncoder.d_model_for_audioAudioEncoder.encoder_attention_heads_for_audioAudioEncoder.encoder_ffn_dim_for_audioAudioEncoder.encoder_layers_for_audioAudioEncoder.attention_dropout_for_audioAudioEncoder.activation_dropout_for_audioAudioEncoder.activation_function_for_audioAudioEncoder.num_mel_bins_for_audioAudioEncoder.max_source_positions_for_audioAudioEncoder.scale_embedding_for_audioAudioEncoder.n_window_for_audioAudioEncoder.n_window_infer_for_audioAudioEncoder.conv_chunksize_for_audioAudioEncoder.downsample_hidden_size_for_audioAudioEncoder.output_dim_for_audioAudioEncoder.num_conv_layers_for_audioAudioEncoder.max_timescale_for_audioAudioEncoder.max_sample_len_for_audioAudioEncoder.model_config
DebugRLHardwareRLHardware.trainer_devices_fractionRLHardware.sampler_devices_fractionRLHardware.chips_per_vmRLHardware.use_pathwaysRLHardware.num_trainer_slicesRLHardware.num_samplers_slicesRLHardware.rollout_data_parallelismRLHardware.rollout_tensor_parallelismRLHardware.rollout_expert_parallelismRLHardware.model_config
VLLMVLLM.kv_cache_bufferVLLM.hbm_utilization_vllmVLLM.swap_space_vllm_gbVLLM.enable_dp_attentionVLLM.enable_expert_parallelVLLM.async_schedulingVLLM.max_num_batched_tokensVLLM.max_num_seqsVLLM.stop_stringsVLLM.vllm_additional_configVLLM.vllm_hf_overridesVLLM.vllm_hf_config_pathVLLM.use_standalone_converterVLLM.vllm_load_formatVLLM.debug_converterVLLM.gcs_debug_pathVLLM.model_config
RLRLDatasetRLEvaluationRewardReward.reward_exact_answerReward.reward_exact_format_matchReward.reward_white_space_format_matchReward.reward_partial_format_matchReward.reward_ratio_guess_to_answer_highReward.reward_ratio_guess_to_answer_lowReward.penalty_incorrect_formatReward.penalty_incorrect_answerReward.math_verify_timeoutReward.math_verify_num_procsReward.model_config
SpecialTokensEngramDerivedValuesDerivedValues.emb_dimDerivedValues.mlp_dimDerivedValues.moe_mlp_dimDerivedValues.num_decoder_layersDerivedValues.num_kv_headsDerivedValues.num_query_headsDerivedValues.num_diloco_replicasDerivedValues.ici_parallelismDerivedValues.dcn_parallelismDerivedValues.using_pipeline_parallelismDerivedValues.context_parallel_sizeDerivedValues.num_target_devicesDerivedValues.global_batch_size_to_train_onDerivedValues.global_batch_size_to_eval_onDerivedValues.global_batch_size_to_loadDerivedValues.global_batch_size_to_load_evalDerivedValues.micro_batch_size_to_train_onDerivedValues.micro_batch_size_to_eval_onDerivedValues.checkpoint_dirDerivedValues.convert_checkpoint_if_possibleDerivedValues.metrics_dirDerivedValues.tensorboard_dirDerivedValues.managed_mldiagnostics_dirDerivedValues.rampup_end_stepDerivedValues.tensors_on_deviceDerivedValues.tensors_to_offloadDerivedValues.global_batch_size_to_load_startDerivedValues.global_batch_size_to_load_incrementDerivedValues.rampup_samples_per_increment_to_loadDerivedValues.model_config
  - get_individual_scales()
  - MaxTextConfig