diff --git a/examples/mcore/qwen3_moe/evaluate_qwen3_235b_a22b_ptd.sh b/examples/mcore/qwen3_moe/evaluate_qwen3_235b_a22b_ptd.sh
index 67a4b896475ec97d7e187f8348def76d9c31b6c8..b4c339cd2ee9be0d5443f167404ea0a9ffac06d4 100644
--- a/examples/mcore/qwen3_moe/evaluate_qwen3_235b_a22b_ptd.sh
+++ b/examples/mcore/qwen3_moe/evaluate_qwen3_235b_a22b_ptd.sh
@@ -57,7 +57,7 @@ GPT_ARGS="
     --use-flash-attn \
     --reuse-fp32-param \
     --hidden-size 4096 \
-    --ffn-hidden-size 8192 \
+    --ffn-hidden-size 12288 \
     --num-attention-heads 64 \
     --group-query-attention \
     --num-query-groups 4 \
diff --git a/examples/mcore/qwen3_moe/evaluate_qwen3_30b_a3b_ptd.sh b/examples/mcore/qwen3_moe/evaluate_qwen3_30b_a3b_ptd.sh
index 049eb88c1eabef1c6f4df50a74bd6a14fd022347..d2d18795a5ed709cc56e8f49e7e05cde91ee6160 100644
--- a/examples/mcore/qwen3_moe/evaluate_qwen3_30b_a3b_ptd.sh
+++ b/examples/mcore/qwen3_moe/evaluate_qwen3_30b_a3b_ptd.sh
@@ -55,7 +55,7 @@ torchrun $DISTRIBUTED_ARGS evaluation.py \
     --expert-model-parallel-size ${EP} \
     --num-layers 48 \
     --hidden-size 2048 \
-    --ffn-hidden-size 8192 \
+    --ffn-hidden-size 6144 \
     --num-attention-heads 32 \
     --group-query-attention \
     --num-query-groups 4 \
diff --git a/examples/mcore/qwen3_moe/generate_qwen3_30b_a3b_ptd.sh b/examples/mcore/qwen3_moe/generate_qwen3_30b_a3b_ptd.sh
index dbc97fa27750a9a4c242236d14f3ff49f1198bc0..df25bdce68a927aeaf40dabcbc494347c7aae0a6 100644
--- a/examples/mcore/qwen3_moe/generate_qwen3_30b_a3b_ptd.sh
+++ b/examples/mcore/qwen3_moe/generate_qwen3_30b_a3b_ptd.sh
@@ -57,7 +57,7 @@ torchrun $DISTRIBUTED_ARGS inference.py \
     --hidden-size 2048 \
     --use-rotary-position-embeddings \
     --num-attention-heads 32 \
-    --ffn-hidden-size 8192 \
+    --ffn-hidden-size 6144 \
     --max-position-embeddings 40960 \
     --seq-length ${SEQ_LENGTH} \
     --make-vocab-size-divisible-by 1 \
diff --git a/examples/mcore/qwen3_moe/tune_qwen3_30b_a3b_4K_full_ptd.sh b/examples/mcore/qwen3_moe/tune_qwen3_30b_a3b_4K_full_ptd.sh
index 3eae75d7db39a45c99758175dc46e03b1b272dfe..ec1181ec91215e7a269c170ef42e727c451a1591 100644
--- a/examples/mcore/qwen3_moe/tune_qwen3_30b_a3b_4K_full_ptd.sh
+++ b/examples/mcore/qwen3_moe/tune_qwen3_30b_a3b_4K_full_ptd.sh
@@ -94,7 +94,7 @@ GPT_ARGS="
     --max-position-embeddings ${SEQ_LENGTH} \
     --num-layers 48 \
     --hidden-size 2048 \
-    --ffn-hidden-size 8192 \
+    --ffn-hidden-size 6144 \
     --num-attention-heads 32 \
     --tokenizer-type PretrainedFromHF \
     --make-vocab-size-divisible-by 1 \
diff --git a/examples/mcore/qwen3_moe/tune_qwen3_30b_a3b_4K_lora_ptd.sh b/examples/mcore/qwen3_moe/tune_qwen3_30b_a3b_4K_lora_ptd.sh
index f87860310aaae15a8e4f0c6c7e6214ea15aaa453..a9ff7c295189ebebf2bf5e679bcd0c0694dc407a 100644
--- a/examples/mcore/qwen3_moe/tune_qwen3_30b_a3b_4K_lora_ptd.sh
+++ b/examples/mcore/qwen3_moe/tune_qwen3_30b_a3b_4K_lora_ptd.sh
@@ -94,7 +94,7 @@ GPT_ARGS="
     --max-position-embeddings ${SEQ_LENGTH} \
     --num-layers 48 \
     --hidden-size 2048 \
-    --ffn-hidden-size 8192 \
+    --ffn-hidden-size 6144 \
     --num-attention-heads 32 \
     --tokenizer-type PretrainedFromHF \
     --make-vocab-size-divisible-by 1 \
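
Note: the corrected values match the `intermediate_size` field in each model's Hugging Face config.json (12288 for Qwen3-235B-A22B, 6144 for Qwen3-30B-A3B). A minimal sanity check, assuming the checkpoints have been downloaded locally; the paths below are placeholders, not part of this change:

    # Print intermediate_size from each HF config; this should report 12288 and 6144.
    for cfg in ./Qwen3-235B-A22B/config.json ./Qwen3-30B-A3B/config.json; do
        python3 -c 'import json, sys; print(sys.argv[1], json.load(open(sys.argv[1]))["intermediate_size"])' "$cfg"
    done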