#!/bin/bash
# Raise the Docker client and Compose HTTP timeouts so long pulls and builds do not time out.
export COMPOSE_HTTP_TIMEOUT=1200
export DOCKER_CLIENT_TIMEOUT=1200
RESTART_FLAG=0
BOOST=''
STAGE_FILE='ds_config_stage3.json'
per_device_train_batch_size=16
TRAIN_DATA=''
HF_TOKEN=''
SKIP=''
SERVER_IP='127.0.0.1'
TRAIN_CONTAINER='gpt-insight-model-gpt-j'
INFERENCE_CONTAINER='gpt-insight-inference-gpt-j'
INTERFACE_CONTAINER='gpt-insight-web-gpt-j'
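# Command-line flags:
#   -i  server IP used by the web interface (default 127.0.0.1)
#   -d  path to the raw training data (.xlsx); copied to ./data/raw_data.xlsx
#   -b  any non-empty value enables "boost" settings (ds_config_stage2.json)
#   -t  Hugging Face access token, exported as HF_TOKEN
#   -s  "skip-train" or "skip-inference" to skip the corresponding phase
# Example invocation (IP, path, and token below are placeholders):
#   ./run_service.sh -i 192.168.0.10 -d /path/to/raw_data.xlsx -t hf_xxx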
while getopts i:d:b:t:s: flag
do
case "$flag" in
i) SERVER_IP=${OPTARG};;
d) TRAIN_DATA=${OPTARG};;
b) BOOST=${OPTARG};;
t) HF_TOKEN=${OPTARG};;
s) SKIP=${OPTARG};;
esac
done
if [ -n "$BOOST" ]
then
echo "boost: $BOOST"
echo "Use boost settings."
STAGE_FILE='ds_config_stage2.json'
per_device_train_batch_size=16
fi
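# If an NVIDIA GPU is present but nvidia-smi does not work, install the NVIDIA driver and
# CUDA toolkit, clear the NVIDIA Xorg config, and switch the default boot target to
# non-graphical; RESTART_FLAG schedules a reboot.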
if [[ $(lspci | grep -i nvidia) ]] && [[ ! $(nvidia-smi) ]];
then
sudo add-apt-repository -y ppa:graphics-drivers
sudo apt-get update
sudo apt-get install -y nvidia-driver-520
sudo apt-get -y install nvidia-cuda-toolkit
if [ -f "/usr/share/X11/xorg.conf.d/10-nvidia.conf" ];
then
sudo truncate -s 0 /usr/share/X11/xorg.conf.d/10-nvidia.conf
fi
sudo systemctl set-default multi-user.target
RESTART_FLAG=1
elif [[ $(lspci | grep -i nvidia) ]];
then
echo "nvidia driver and tool has been installed."
else
echo "nvidia gpu is not detected. skip nvidia driver and tool install"
fi
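# Install Docker Engine from the official Docker apt repository if it is missing. Adding $USER
# to the docker group only takes effect after a new login, which the reboot below provides.
# If Docker is already installed, make sure the nginx_network bridge network exists.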
if [[ ! $(docker --version) ]];
then
sudo apt-get update -y
sudo apt-get install -y apt-transport-https ca-certificates curl gnupg-agent software-properties-common
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu bionic stable"
sudo apt-get update -y
sudo apt-get install -y docker-ce
sudo usermod -a -G docker $USER
RESTART_FLAG=1
else
echo "docker has been installed."
docker network inspect nginx_network --format {{.Id}} >/dev/null 2>&1 || docker network create --driver bridge nginx_network
fi
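# If a GPU is present but the nvidia-docker runtime is missing, install nvidia-docker2 and
# restart the Docker daemon so containers can access the GPU.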
if [[ $(lspci | grep -i nvidia) ]] && [[ ! $(nvidia-docker version) ]];
then
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt-get update -y
sudo apt-get install -y nvidia-docker2
sudo systemctl restart docker.service
RESTART_FLAG=1
elif [[ $(lspci | grep -i nvidia) ]];
then
echo "nvidia-docker has been installed."
else
echo "nvidia gpu is not detected. skip nvidia-docker install"
fi
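# Install the Docker Compose plugin if `docker compose` is not available.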
if [[ ! $(docker compose version) ]];
then
sudo apt-get install -y docker-compose-plugin
else
echo "docker compose has been installed."
fi
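# If any driver or runtime was just installed, reboot now; re-run this script after the
# machine comes back up to continue with training and inference.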
if [[ $RESTART_FLAG == 1 ]];
then
sudo reboot
fi
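# Stop any running service containers, export the Hugging Face token, and stage the
# training data if a file was passed with -d.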
base_path=$(pwd)
docker compose -f $base_path/docker-compose.yml down
export HF_TOKEN=$HF_TOKEN
if [ -n "$TRAIN_DATA" ]
then
cp $TRAIN_DATA $base_path/data/raw_data.xlsx
fi
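# Training phase: preprocess the raw spreadsheet into train/validation CSVs, then fine-tune
# EleutherAI/gpt-j-6B with DeepSpeed.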
if [[ ! $SKIP == "skip-train" ]]
then
num_gpus=$(nvidia-smi --list-gpus | wc -l)
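# Keep the effective global batch size at 128
# (per_device_train_batch_size x num_gpus x gradient_accumulation_steps).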
gradient_accumulation_steps=$((128/$per_device_train_batch_size/$num_gpus))
docker compose -f $base_path/docker-compose.yml up -d $TRAIN_CONTAINER
docker exec -it $TRAIN_CONTAINER bash -c "python3 preprocess.py \
--data /data/raw_data.xlsx \
--data_format gpt-j \
--output_dir /data \
--max_len 2048"
docker exec -it $TRAIN_CONTAINER bash -c "deepspeed --num_gpus=$num_gpus run_clm.py \
--deepspeed $STAGE_FILE \
--model_name_or_path EleutherAI/gpt-j-6B \
--train_file /data/train.csv \
--validation_file /data/validation.csv \
--do_train \
--do_eval \
--bf16 \
--overwrite_cache \
--evaluation_strategy=steps \
--output_dir /model/fine-tunning \
--num_train_epochs 6 \
--eval_steps 100 \
--gradient_accumulation_steps $gradient_accumulation_steps \
--per_device_train_batch_size $per_device_train_batch_size \
--use_fast_tokenizer False \
--learning_rate 5e-06 \
--warmup_steps 10 \
--save_total_limit 1 \
--save_steps 100 \
--save_strategy steps \
--tokenizer_name EleutherAI/gpt-j-6B \
--load_best_model_at_end=True \
--block_size=2048 \
--overwrite_output_dir"
docker compose -f $base_path/docker-compose.yml down
fi
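# Inference phase: substitute SERVER_IP into the web front end's main.js, then start the
# interface and inference containers.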
if [[ ! $SKIP == "skip-inference" ]]
then
echo "set server ip: $SERVER_IP"
cp $base_path/build/interface/web/gpt_qa/template/main.js.temp $base_path/build/interface/web/gpt_qa/main.js
sed -i "s/SERVER_IP/$SERVER_IP/g" $base_path/build/interface/web/gpt_qa/main.js
docker compose -f $base_path/docker-compose.yml up -d $INTERFACE_CONTAINER
docker compose -f $base_path/docker-compose.yml up -d $INFERENCE_CONTAINER
echo "Please enter the URL link below into your browser to activate the dialog interface
URL link: http://$SERVER_IP/gpt/qa/"
fi
unset HF_TOKEN