From 7621f05d3f33518a96427f86f78ad236fff9dd9c Mon Sep 17 00:00:00 2001 From: ViperEkura <3081035982@qq.com> Date: Sun, 17 May 2026 17:07:01 +0800 Subject: [PATCH] =?UTF-8?q?docs:=20AdamW=20beta=20=E9=BB=98=E8=AE=A4?= =?UTF-8?q?=E5=80=BC=E6=94=B9=E4=B8=BA=20(0.9,=200.95)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 与 Muon 优化器的 AdamW 子优化器保持一致 - 同步更新 train.py/training.md/params.md/README --- README.md | 4 ++-- assets/docs/README-zh-CN.md | 4 ++-- assets/docs/params.md | 8 ++++---- assets/docs/training.md | 4 ++-- scripts/tools/train.py | 8 ++++---- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 0589828..5aa72c7 100644 --- a/README.md +++ b/README.md @@ -90,8 +90,8 @@ nohup python scripts/tools/train.py \ --warmup_ratio=0.05 \ --max_lr=1e-4 \ --max_grad_norm=1.0 \ - --adamw_beta1=0.95 \ - --adamw_beta2=0.99 \ + --adamw_beta1=0.9 \ + --adamw_beta2=0.95 \ --adamw_weight_decay=0.01 \ --window_size=2048 \ --ckpt_interval=10000 \ diff --git a/assets/docs/README-zh-CN.md b/assets/docs/README-zh-CN.md index 41d1743..e30e4c4 100644 --- a/assets/docs/README-zh-CN.md +++ b/assets/docs/README-zh-CN.md @@ -96,8 +96,8 @@ nohup python scripts/tools/train.py \ --warmup_ratio=0.05 \ --max_lr=1e-4 \ --max_grad_norm=1.0 \ - --adamw_beta1=0.95 \ - --adamw_beta2=0.99 \ + --adamw_beta1=0.9 \ + --adamw_beta2=0.95 \ --adamw_weight_decay=0.01 \ --window_size=2048 \ --ckpt_interval=10000 \ diff --git a/assets/docs/params.md b/assets/docs/params.md index ccef336..ae86e39 100644 --- a/assets/docs/params.md +++ b/assets/docs/params.md @@ -25,8 +25,8 @@ | Parameter | Description | Default | |-----------|-------------|---------| -| `--adamw_beta1` | AdamW beta1 | 0.95 | -| `--adamw_beta2` | AdamW beta2 | 0.99 | +| `--adamw_beta1` | AdamW beta1 | 0.9 | +| `--adamw_beta2` | AdamW beta2 | 0.95 | | `--adamw_weight_decay` | AdamW weight decay | 0.01 | ### Data Loading @@ -81,8 +81,8 @@ nohup python scripts/tools/train.py \ --warmup_ratio=0.05 \ --max_lr=1e-4 \ --max_grad_norm=1.0 \ - --adamw_beta1=0.95 \ - --adamw_beta2=0.99 \ + --adamw_beta1=0.9 \ + --adamw_beta2=0.95 \ --adamw_weight_decay=0.01 \ --window_size=2048 \ --ckpt_interval=10000 \ diff --git a/assets/docs/training.md b/assets/docs/training.md index a97f485..0fde6e5 100644 --- a/assets/docs/training.md +++ b/assets/docs/training.md @@ -196,8 +196,8 @@ nohup python scripts/tools/train.py \ --warmup_ratio=0.05 \ --max_lr=1e-4 \ --max_grad_norm=1.0 \ - --adamw_beta1=0.95 \ - --adamw_beta2=0.99 \ + --adamw_beta1=0.9 \ + --adamw_beta2=0.95 \ --adamw_weight_decay=0.01 \ --window_size=2048 \ --ckpt_interval=10000 \ diff --git a/scripts/tools/train.py b/scripts/tools/train.py index 1d7c72b..e9cd8df 100644 --- a/scripts/tools/train.py +++ b/scripts/tools/train.py @@ -69,14 +69,14 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--adamw_beta1", type=float, - default=0.95, - help="Beta values for AdamW optimizer.", + default=0.9, + help="Beta1 for AdamW optimizer.", ) parser.add_argument( "--adamw_beta2", type=float, - default=0.99, - help="Beta values for AdamW optimizer.", + default=0.95, + help="Beta2 for AdamW optimizer.", ) parser.add_argument( "--adamw_weight_decay",