This article was first published under a community exclusivity agreement. Reprinting is prohibited for 30 days, and after 30 days without authorization; violations will be pursued!
With the explosive rise of ChatGPT, large models based on the Transformer architecture have moved from behind the scenes to center stage. But ChatGPT's success did not happen overnight: it evolved from the early GPT-1 to GPT-2, then to GPT-3 and InstructGPT, on to GPT-3.5 and ChatGPT, and finally to today's multimodal large model GPT-4.
However, for the series of works after GPT-3, OpenAI did not open-source its models, so we have no way to study the underlying principles ourselves. GPT-2, as one of the ancestors of the GPT series, is open source. This article therefore uses Megatron-LM to pretrain a GPT-2 model. To keep the article readable, the full scripts and code are hosted on GitHub: llm-action.
Setting up the runtime environment
The base environment is configured as follows:
- OS: Ubuntu 18.04
- CPUs: a single node with 384 GB of RAM and 2 physical Intel CPUs, 20 cores each
- GPUs: 4x A800 80GB
- Python: 3.8.10
- NVIDIA driver: 525.105.17 (choose the driver matching your GPU model on NVIDIA's download page)
- CUDA Toolkit: 12.1
To reproduce the entire GPT-2 pretraining process quickly, this article builds the runtime environment from NVIDIA's official Docker image.
First, pull the corresponding PyTorch image from NVIDIA:
docker pull nvcr.io/nvidia/pytorch:23.04-py3
Once the image has been pulled, create a container for the training environment.
docker run -dt --name nvidia_pytorch_env --restart=always --gpus all \
--network=host \
--shm-size 4G \
-v /home/gdong/workspace:/workspace \
-w /workspace \
nvcr.io/nvidia/pytorch:23.04-py3 \
/bin/bash
Then enter the container to prepare the code, model, and data.
docker exec -it nvidia_pytorch_env bash
Preparing the code
Download the Megatron-LM source code, then switch to the corresponding commit id:
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout 992da75
Preparing the model weights and vocabulary
Download the GPT-2 weights:
wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O megatron_lm_345m_v0.0.zip
After extraction, the file layout looks like this:
> tree -h megatron
megatron
├── [   8]  latest_checkpointed_iteration.txt
└── [4.0K]  release
    └── [4.0K]  mp_rank_00
        └── [677M]  model_optim_rng.pt

2 directories, 2 files
> cat megatron/latest_checkpointed_iteration.txt
release
Download the GPT-2 vocabulary and merges files:
https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
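The two files above can be fetched with wget; the target directory below is an assumption chosen to match the VOCAB_FILE/MERGE_FILE paths used later, so adjust it as needed.

```shell
# Download the GPT-2 vocabulary and BPE merges files
# (the /workspace/model/gpt2-vocab directory is assumed; adjust to taste)
mkdir -p /workspace/model/gpt2-vocab
cd /workspace/model/gpt2-vocab
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
```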
Preparing the dataset
To train GPT-2, Megatron-LM uses the publicly available OpenWebText libraries from jcpeterson and eukaryote31 to download URLs, then filters, cleans, and deduplicates the downloaded content following the process described in the openwebtext directory. From the content behind Reddit URLs up to October 2018, this yields roughly 37 GB of text.
Below we prepare the training data following the openwebtext documentation in Megatron-LM.
First, install the dependency libraries.
pip install ftfy langdetect numpy torch pandas nltk sentencepiece boto3 tqdm regex bs4 newspaper3k htmlmin tldextract -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
Then, install LSH.
git clone https://github.com/mattilyra/LSH
cd LSH
git checkout a57069b
python setup.py install
Since the Python version used here is 3.8.10, there is an incompatibility: the install fails with errors, and you simply edit the code as the error messages suggest.
In lsh/cMinhash.cpp:
- change exc_type to curexc_type
- change exc_value to curexc_value
- change exc_traceback to curexc_traceback
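The three renames can also be applied with a single sed one-liner, equivalent to the manual edits above (run from the LSH checkout):

```shell
# Rename the PyThreadState fields that newer CPython renamed;
# \b word boundaries leave any already-correct curexc_* names untouched.
sed -i -E 's/\b(exc_type|exc_value|exc_traceback)\b/cur\1/g' lsh/cMinhash.cpp
```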
Once installed, download the deduplicated URLs from jcpeterson and place them in the urls directory. Since there are many files, only one URL file is downloaded here for demonstration.
> mkdir urls
> tree -h urls/
urls/
└── [5.3M]  RS_2011-01.bz2.deduped.txt

0 directories, 1 file
Then, remove the blacklisted URLs.
# python blacklist_urls.py <path to the downloaded deduplicated URLs> <filename for clean urls, e.g. clean_urls.txt>
python3 blacklist_urls.py ./urls clean_urls.txt
# keep only the first 100 cleaned URLs
head -n100 clean_urls.txt >> clean_urls_100.txt
Next, use the openwebtext utilities to download content from the cleaned URLs. You need to change the defaults of the --sqlite_meta and --save_uncompressed options in download.py to False and True respectively. Then, after running python3 openwebtext/download.py clean_urls.txt, a scraped folder is produced, and the text downloaded from each URL is saved in its data subfolder.
# ef42b51
git clone https://github.com/yet-another-account/openwebtext.git
# vim openwebtext/download.py
python3 openwebtext/download.py ./Megatron-LM/tools/openwebtext/clean_urls.txt --output_dir /workspace/code/scraped
When the download finishes, the layout looks like this:
> tree -h /workspace/code/scraped
/workspace/code/scraped
├── [304K]  data
│   ├── [ 176]  0000300-ab9ff12f7658b8764a413bf58d58bc48b866b0c163ce5c0442296dce46ff0ff8.txt
│   ├── ...
│   └── [ 634]  0009896-6e15400f49434b3dbf9421a8f342f80f26c1e901f78f6350d4b738f58c456bdd.txt
└── [296K]  meta
    ├── [ 154]  0001000-ab50f2cd5366369108d58d6e4eb77e8c4babf56e634a33dcd880597684109fc4.json
    ├── ...
    └── [ 224]  0009896-6e15400f49434b3dbf9421a8f342f80f26c1e901f78f6350d4b738f58c456bdd.json

2 directories, 4860 files
The file contents look like this:
# the meta subfolder stores metadata
> cat /workspace/code/scraped/meta/0009896-6e15400f49434b3dbf9421a8f342f80f26c1e901f78f6350d4b738f58c456bdd.json
{"url": "http://minnesotaindependent.com/74302/bachmann-says-transportation-projects-shouldnt-count-as-earmarks", "word_count": 73, "elapsed": 3.2160894870758057, "scraper": "newspaper", "domain": "minnesotaindependent.com"}
# the data subfolder stores the text data
> cat /workspace/code/scraped/data/0009896-6e15400f49434b3dbf9421a8f342f80f26c1e901f78f6350d4b738f58c456bdd.txt
Der eigene Bodenwischer ist der wichtigste Begleiter im täglichen Haushalt. Ob für Parkett, Fliesen oder Laminat: Qualität, Ausstattung und Preis spielen bei der Kaufentscheidung eine große Rolle.
...
Bodenwischer für …
Merge the text files in the data subfolder into a single JSON file.
python3 Megatron-LM/tools/openwebtext/merge_data.py --data_path /workspace/code/scraped/data --output_file /workspace/data/merged_output.json
The merged file format looks like this:
> head -n6 /workspace/data/merged_output.json
{"text": "With every new year, it's murder for Neal Smither and his crew.\n"}
{"text": "\n"}
{"text": "Suicide, too.\n"}
{"text": "\n"}
{"text": "As owner of Crime Scene Cleaners, Smither's job is to clean up the bloody messes left behind when people kill each other or themselves - and those first few weeks after Jan. 1 are his busiest time of year.\n"}
{"text": "\n"}
Data cleaning
Run ftfy, perform English-language detection, and remove documents with fewer than 128 tokens.
python3 cleanup_dataset.py /workspace/data/merged_output.json /workspace/data/merged_cleand.json
Comparing the data before and after cleaning:
> wc -l merged_output.json
78802 merged_output.json
> wc -l merged_cleand.json
2456 merged_cleand.json
Then, shuffle the cleaned dataset.
shuf /workspace/data/merged_cleand.json -o /workspace/data/train_data.json
Data preprocessing
Next, preprocess the training dataset.
python tools/preprocess_data.py \
--input /workspace/data/train_data.json \
--output-prefix /workspace/data/my-gpt2 \
--vocab-file /workspace/model/gpt2-vocab/gpt2-vocab.json \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--merge-file /workspace/model/gpt2-vocab/gpt2-merges.txt \
--append-eod \
--workers 20 \
--chunk-size 25
The output files are named my-gpt2_text_document.bin and my-gpt2_text_document.idx. When training GPT-2, pass the name without the extension as --data-path.
Now all the preparation work is done; next we start training the model.
Model training
Single-GPU training
Below, edit the examples/pretrain_gpt.sh script, configuring the checkpoint path (CHECKPOINT_PATH), vocabulary file path (VOCAB_FILE), merges file path (MERGE_FILE), dataset path (DATA_PATH), and so on:
#!/bin/bash
# Runs the "345M" parameter model
export CUDA_DEVICE_MAX_CONNECTIONS=1
CHECKPOINT_PATH=/workspace/model/megatron-models/345m
VOCAB_FILE=/workspace/model/gpt2-vocab/gpt2-vocab.json
MERGE_FILE=/workspace/model/gpt2-vocab/gpt2-merges.txt
DATA_PATH=/workspace/data/my-gpt2_text_document
MODEL_PATH=/workspace/model/megatron-models/output
# model hyperparameters
GPT_ARGS="
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--micro-batch-size 1 \
--global-batch-size 2 \
--lr 0.00015 \
--train-iters 5000 \
--lr-decay-iters 320000 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--lr-warmup-fraction .01 \
--clip-grad 1.0 \
--fp16
"
# dataset and vocabulary path arguments
DATA_ARGS="
--data-path $DATA_PATH \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--data-impl mmap \
--split 700,200,100
"
# checkpoint output, evaluation, and logging arguments
OUTPUT_ARGS="
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10
"
# launch the training job
torchrun pretrain_gpt.py \
$GPT_ARGS \
$DATA_ARGS \
$OUTPUT_ARGS \
--save $MODEL_PATH \
--load $CHECKPOINT_PATH
Then run the following to start training:
CUDA_VISIBLE_DEVICES=3 sh examples/pretrain_gpt.sh
After training completes, the checkpoint output looks like this:
> tree -h 345m
345m
├── [4.0K]  iter_0005000
│   └── [4.0K]  mp_rank_00
│       └── [4.6G]  model_optim_rng.pt
└── [   4]  latest_checkpointed_iteration.txt
> cat 345m/latest_checkpointed_iteration.txt
5000
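As a sanity check, the GPT_ARGS above (24 layers, hidden size 1024, 1024 positions) together with the padded vocabulary of 50,304 that Megatron logs imply the "345M" parameter count. A sketch of the arithmetic, assuming GPT-2's learned position embeddings, bias terms, and a tied output embedding:

```python
def gpt2_param_count(layers: int, hidden: int, vocab: int, positions: int) -> int:
    """Back-of-the-envelope GPT-2 parameter count."""
    # token + position embeddings (the output projection is tied to the token embedding)
    embeddings = vocab * hidden + positions * hidden
    per_block = (
        hidden * 3 * hidden + 3 * hidden    # fused QKV projection + bias
        + hidden * hidden + hidden          # attention output projection + bias
        + hidden * 4 * hidden + 4 * hidden  # MLP up-projection + bias
        + 4 * hidden * hidden + hidden      # MLP down-projection + bias
        + 2 * 2 * hidden                    # two layernorms (scale + shift)
    )
    final_layernorm = 2 * hidden
    return embeddings + layers * per_block + final_layernorm

print(gpt2_param_count(layers=24, hidden=1024, vocab=50304, positions=1024))
# -> 354871296, the same count Megatron reports when loading the model
```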
Besides single-GPU training, we can also train on multiple GPUs. Below we demonstrate training with 4-GPU data parallelism, 4-GPU tensor parallelism, 4-GPU pipeline parallelism, and multi-dimensional hybrid parallelism (2-GPU tensor parallelism + 2-GPU pipeline parallelism).
Data-parallel training (4DP)
Below we train with 4-way data parallelism (4DP) by running the pretrain_gpt_distributed.sh script.
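The distributed scripts are variations on the single-GPU launch: they add a torchrun process-group spec and, for the TP/PP variants, Megatron's model-parallel size flags. A sketch under assumed values, not the verbatim script contents:

```shell
GPUS_PER_NODE=4
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes 1 \
                  --node_rank 0 --master_addr localhost --master_port 6000"

# 4DP: four ranks and no model-parallel flags (data parallelism is the default).
# 4TP would add      --tensor-model-parallel-size 4
# 4PP would add      --pipeline-model-parallel-size 4
# 2TP+2PP would add  both flags with value 2
torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
    $GPT_ARGS $DATA_ARGS $OUTPUT_ARGS \
    --distributed-backend nccl \
    --save $MODEL_PATH \
    --load $CHECKPOINT_PATH
```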
After training completes, the checkpoint output:
tree -h /workspace/model/megatron-models/345m-init-4tp
/workspace/model/megatron-models/345m-init-4tp
├── [4.0K]  iter_0002000
│   ├── [4.0K]  mp_rank_00
│   │   └── [1.2G]  model_optim_rng.pt
...
│   └── [4.0K]  mp_rank_03
│       └── [1.2G]  model_optim_rng.pt
└── [   4]  latest_checkpointed_iteration.txt

10 directories, 9 files
> cat /workspace/model/megatron-models/345m-init-4tp/latest_checkpointed_iteration.txt
2000
GPU memory usage during training:
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 3227288 C /usr/bin/python 9652MiB |
| 1 N/A N/A 3227289 C /usr/bin/python 9652MiB |
| 2 N/A N/A 3227290 C /usr/bin/python 9652MiB |
| 3 N/A N/A 3227291 C /usr/bin/python 9652MiB |
+-----------------------------------------------------------------------------+
Model-parallel training (4PP)
Below we train with 4-way pipeline parallelism (4PP) using the pretrain_gpt_distributed_with_4pp.sh script.
After training completes, the checkpoint output:
> tree -h /workspace/model/megatron-models/345m-init-4pp
/workspace/model/megatron-models/345m-init-4pp
├── [4.0K]  iter_0002000
│   ├── [4.0K]  mp_rank_00_000
│   │   └── [1.7G]  model_optim_rng.pt
│   ├── [4.0K]  mp_rank_00_001
│   │   └── [1009M]  model_optim_rng.pt
│   ├── [4.0K]  mp_rank_00_002
│   │   └── [1009M]  model_optim_rng.pt
│   └── [4.0K]  mp_rank_00_003
│       └── [1.7G]  model_optim_rng.pt
└── [   4]  latest_checkpointed_iteration.txt
> cat /workspace/model/megatron-models/345m-init-4pp/latest_checkpointed_iteration.txt
2000
GPU memory usage during training:
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 2630871 C /usr/bin/python 8680MiB |
| 1 N/A N/A 2630872 C /usr/bin/python 6408MiB |
| 2 N/A N/A 2630873 C /usr/bin/python 5080MiB |
| 3 N/A N/A 2630874 C /usr/bin/python 5436MiB |
+-----------------------------------------------------------------------------+
Model-parallel training (4TP)
Below we train with 4-way tensor parallelism (4TP) using the pretrain_gpt_distributed_with_4tp.sh script.
After training completes, the checkpoint output:
tree -h /workspace/model/megatron-models/345m-init-4tp
/workspace/model/megatron-models/345m-init-4tp
├── [4.0K]  iter_0002000
│   ├── [4.0K]  mp_rank_00
│   │   └── [1.2G]  model_optim_rng.pt
...
│   └── [4.0K]  mp_rank_03
│       └── [1.2G]  model_optim_rng.pt
└── [   4]  latest_checkpointed_iteration.txt
> cat /workspace/model/megatron-models/345m-init-4tp/latest_checkpointed_iteration.txt
2000
GPU memory usage during training:
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 3895346 C /usr/bin/python 4236MiB |
| 1 N/A N/A 3895347 C /usr/bin/python 4176MiB |
| 2 N/A N/A 3895348 C /usr/bin/python 4168MiB |
| 3 N/A N/A 3895349 C /usr/bin/python 4176MiB |
+-----------------------------------------------------------------------------+
Model-parallel training (2TP+2PP)
Below we train with 2-way tensor parallelism plus 2-way pipeline parallelism by running the pretrain_gpt_distributed_with_mp.sh script.
After training completes, the checkpoint output:
> tree -h 345m-init-mp
345m-init-mp
├── [4.0K]  iter_0005000
│   ├── [4.0K]  mp_rank_00_000
│   │   └── [1.3G]  model_optim_rng.pt
│   ├── [4.0K]  mp_rank_00_001
│   │   └── [1.3G]  model_optim_rng.pt
│   ├── [4.0K]  mp_rank_01_000
│   │   └── [1.3G]  model_optim_rng.pt
│   └── [4.0K]  mp_rank_01_001
│       └── [1.3G]  model_optim_rng.pt
└── [   4]  latest_checkpointed_iteration.txt
GPU memory usage during training:
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 3448098 C /usr/bin/python 8732MiB |
| 1 N/A N/A 3448099 C /usr/bin/python 8732MiB |
| 2 N/A N/A 3448100 C /usr/bin/python 6828MiB |
| 3 N/A N/A 3448101 C /usr/bin/python 7078MiB |
+-----------------------------------------------------------------------------+
Merging model weights
Merging a model trained with distributed parallelism can make it easier to use on fewer GPUs.
The merge is done with the following script. This example reads a GPT model trained with 2TP and 2PP model parallelism and writes out a model with 1TP and 1PP.
python tools/checkpoint_util.py \
--model-type GPT \
--load-dir /workspace/model/megatron-models/345m-init-mp \
--save-dir /workspace/model/megatron-models/345m-init-mp-out \
--target-tensor-parallel-size 1 \
--target-pipeline-parallel-size 1
After merging the model weights, we use the merged checkpoint below for model evaluation and inference.
Model evaluation
Below we evaluate cloze accuracy on the LAMBADA dataset: given the preceding tokens, how accurately the model predicts the final token.
Run the following command to evaluate the model; before executing the script, configure the model checkpoint, evaluation dataset, and vocabulary paths.
sh eval_gpt2_lambada.sh
Note: use --strict-lambada to require whole-word matching.
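The metric itself is simple. A sketch of strict scoring (illustrative only, not Megatron's implementation): a prediction counts only if the generated final word matches the reference word exactly.

```python
def strict_lambada_accuracy(predicted_words, target_words):
    """Fraction of examples whose predicted final word equals the target exactly."""
    assert len(predicted_words) == len(target_words)
    correct = sum(p == t for p, t in zip(predicted_words, target_words))
    return correct / len(target_words)

print(strict_lambada_accuracy(["dog", "ran", "home"], ["dog", "walked", "home"]))
# -> 0.6666666666666666
```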
Part of the run log:
using world size: 1, data-parallel-size: 1, tensor-model-parallel size: 1, pipeline-model-parallel size: 1
setting global batch size to 8
using torch.float16 for parameters ...
------------------------ arguments ------------------------
accumulate_allreduce_grads_in_fp32 .............. False
...
world_size ...................................... 1
-------------------- end of arguments ---------------------
setting number of micro-batches to constant 1
> building GPT2BPETokenizer tokenizer ...
> padded vocab (size: 50257) with 47 dummy tokens (new size: 50304)
> initializing torch distributed ...
> initialized tensor model parallel with size 1
> initialized pipeline model parallel with size 1
> setting random seeds to 1234 ...
> compiling dataset index builder ...
...
make: Leaving directory '/workspace/code/bak/Megatron-LM/megatron/data'
>>> done with dataset index builder. Compilation time: 13.399 seconds
> compiling and loading fused kernels ...
>>> done with compiling and loading fused kernels. Compilation time: 1.411 seconds
building GPT model ...
> number of parameters on (tensor, pipeline) model parallel rank (0, 0): 354871296
loading checkpoint from /workspace/model/megatron-models/345m-init-mp-out at iteration 5000
checkpoint version 3.0
successfully loaded checkpoint from /workspace/model/megatron-models/345m-init-mp-out at iteration 5000
> building lambada dataset from /workspace/data/lambada_test.jsonl ...
> found 5153 samples.
> working on iteration: 0
...
> working on iteration: 640
--------------------------------------------------------------------------------------------------------------------
validation results on LAMBADA | number correct: 0.0000E+00 | total examples: 5.1530E+03 | avg accuracy: 0.0000E+00
--------------------------------------------------------------------------------------------------------------------
done :-)
Model inference service
tools/run_text_generation_server.py contains a simple REST service for generating text. To run it, you need to specify an appropriate pretrained checkpoint. There are also optional parameters such as temperature, top-k, and top-p that can be configured; see --help or the source file for details.
Before starting the inference service, install the dependencies first:
pip install flask flask-restful -i https://pypi.tuna.tsinghua.edu.cn/simple --trusted-host pypi.tuna.tsinghua.edu.cn
Once installed, use the examples/run_text_generation_server_345M.sh script to start an inference service backed by the GPT-2 model.
sh examples/run_text_generation_server_345M.sh
Once the inference service is running, you can use tools/text_generation_cli.py to query it; it takes a single argument, the host the service is running on.
> python tools/text_generation_cli.py localhost:5000
Enter prompt: hello
Enter number of tokens to generate: 5
Megatron Response:
hello! Until that protagonist receive
Enter prompt: world
Enter number of tokens to generate: 2
Megatron Response:
worldboarding-
Enter prompt:
Besides that, you can also query the endpoint directly with curl or any other API testing tool:
> curl 'http://localhost:5000/api' -X 'PUT' -H 'Content-Type: application/json; charset=UTF-8' -d '{"prompts":["Hello world"], "tokens_to_generate":1}'
{"logprobs":null,"segments":[["Hello"," world",","]],"text":["Hello world,"]}
The above uses a single GPU for inference; we can also run multi-GPU model-parallel inference.
Inference with 4TP:
sh examples/run_text_generation_server_345M_4_tensor_parallel.sh
GPU memory usage:
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 1844443 C /usr/bin/python 788MiB |
| 1 N/A N/A 1844444 C /usr/bin/python 788MiB |
| 2 N/A N/A 1844445 C /usr/bin/python 788MiB |
| 3 N/A N/A 1844446 C /usr/bin/python 788MiB |
+-----------------------------------------------------------------------------+
Inference with 2TP+2PP:
sh examples/run_text_generation_server_345M_2tp_2dp.sh
GPU memory usage:
+-----------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=============================================================================|
| 0 N/A N/A 1869409 C /usr/bin/python 1222MiB |
| 1 N/A N/A 1869410 C /usr/bin/python 1222MiB |
| 2 N/A N/A 1869411 C /usr/bin/python 1222MiB |
| 3 N/A N/A 1869412 C /usr/bin/python 1222MiB |
+-----------------------------------------------------------------------------+
Conclusion
Based on NVIDIA's open-source Megatron-LM framework, this article completed the entire process of pretraining, evaluating, and serving a GPT-2 model, and also walked through the full preprocessing pipeline for the GPT-2 training dataset.
If you feel this article has helped you, I look forward to your likes, bookmarks, and follows~~