run.py
import argparse
import os

# Suppress TensorFlow C++ log messages; this must be set before TensorFlow is loaded
# (it is presumably imported by the utils modules below).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import time
import warnings

import numpy as np
from tqdm import tqdm

from utils import sequence_modeling
from utils import get_face_areas
from utils import three_d_resnet_bi_lstm
from utils import get_log_mel
from utils.get_models import load_weights_EE, load_weights_LSTM, resnet18, cross_modal_attention

warnings.filterwarnings('ignore', category=FutureWarning)
parser = argparse.ArgumentParser(description="Audio-visual emotion and phrase recognition on a folder of videos")
parser.add_argument('--path_video', type=str, default='./test_video/', help='Path to the directory with input videos')
parser.add_argument('--path_save', type=str, default='report/', help='Path to save the report')
parser.add_argument('--conf_d', type=float, default=0.7, help='Confidence threshold for rejecting false face detections')
parser.add_argument('--path_FE_model', type=str, default="./models/EMOAffectNet_LSTM/EmoAffectnet/weights.h5",
                    help='Path to the model for face feature extraction')
parser.add_argument('--path_LSTM_model', type=str, default="./models/EMOAffectNet_LSTM/LSTM/weights.h5",
                    help='Path to the model for emotion prediction')
args = parser.parse_args()
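# Models used below (roles inferred from how they are called in pred_one_video):
#   EE_model    - EmoAffectNet feature extractor applied to the detected face crops;
#   LSTM_model  - temporal model that predicts the emotion from the face-feature sequences;
#   model_video - 3D ResNet-18 + BiLSTM lip-reading model over 60-frame 88x88 lip-region clips;
#   model_audio - ResNet-18 phrase classifier over 128x251 log-mel spectrograms;
#   model_av    - cross-modal attention block fusing the audio and video phrase probabilities.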
EE_model = load_weights_EE(args.path_FE_model)
LSTM_model = load_weights_LSTM(args.path_LSTM_model)
model_video = three_d_resnet_bi_lstm.build_three_d_resnet_18((60, 88, 88, 3), 12, 'softmax', None, True, '3D')
model_video.build((None, 60, 88, 88, 3))
model_audio = resnet18(input_shape=(128, 251, 1), number_class=12)
model_av = cross_modal_attention(number_class=12)
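# Per-video pipeline (see the steps inside the function):
#   1) detect face and lip regions,
#   2) recognise the emotion from the face sequence (VER),
#   3) load valence-specific (NEU/POS/NEG) weights for the audio, video and AV models,
#   4) predict the spoken phrase from audio (ASR), lips (VSR) and their fusion (AVSR).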
def pred_one_video(path):
    all_time = time.time()
    label_phrase = ["DFA", "IEO", "IOM",
                    "ITH", "ITS", "IWL",
                    "IWW", "MTI", "TAI",
                    "TIE", "TSI", "WSI"]
    start_time = time.time()
    detect = get_face_areas.VideoCamera(path_video=path, conf=args.conf_d)
    dict_face_areas, dict_lips_areas, total_frame = detect.get_frame()
    end_pre_video = time.time() - start_time
    name_frames = list(dict_face_areas.keys())
    face_areas = list(dict_face_areas.values())
    features = EE_model(np.stack(face_areas)).numpy()
    start_time = time.time()
    seq_features = sequence_modeling.get_sequence(name_frames, features)
    pred_emo = LSTM_model(np.stack(seq_features)).numpy()
    # Average the per-sequence predictions and normalise them into a probability distribution.
    pred_sum = np.sum(pred_emo, axis=0)
    pred_emo = pred_sum / np.sum(pred_sum)
    sort_pred = np.argsort(-pred_emo)
    # If the most probable class is index 3, fall back to the second most probable one.
    if sort_pred[0] == 3:
        pred_emo = sort_pred[1]
    else:
        pred_emo = sort_pred[0]
    end_emo = time.time() - start_time
    # Map the predicted emotion index to a valence group and load the matching model weights.
    if pred_emo == 0:
        p_emo = 'NEU'
    elif pred_emo == 1:
        p_emo = 'POS'
    else:
        p_emo = 'NEG'
    weights_video = './models/VALENCE_VIDEO/{}/weights.h5'.format(p_emo)
    weights_audio = './models/VALENCE_AUDIO/{}/weights.h5'.format(p_emo)
    weights_av = './models/VALENCE_AV/{}/weights.h5'.format(p_emo)
    model_video.load_weights(weights_video)
    model_audio.load_weights(weights_audio)
    model_av.load_weights(weights_av)
    # Audio branch: log-mel spectrogram segments -> phrase probabilities per segment.
    start_time = time.time()
    seg_features_audio = get_log_mel.get_log_mel(path)
    prob_audio = model_audio(np.stack(seg_features_audio)).numpy()
    end_audio = time.time() - start_time

    # Video branch: lip-region sequences (window 60, step 30) -> phrase probabilities per window.
    start_time = time.time()
    name_frames_lips = list(dict_lips_areas.keys())
    seq_features_lips = sequence_modeling.get_sequence(name_frames_lips, np.stack(list(dict_lips_areas.values())),
                                                       win=60, step=30)
    prob_video = model_video(np.stack(seq_features_lips)).numpy()
    end_video = time.time() - start_time

    # Audio-visual fusion: pad both branches to three segments (the fusion model apparently
    # expects a fixed-length input) and fuse them with the cross-modal attention block.
    start_time = time.time()
    prob_audio = np.pad(prob_audio, ((0, 3 - prob_audio.shape[0]), (0, 0)), 'mean')
    prob_audio = np.expand_dims(prob_audio, axis=0)
    prob_video = np.pad(prob_video, ((0, 3 - prob_video.shape[0]), (0, 0)), 'mean')
    prob_video = np.expand_dims(prob_video, axis=0)
    prob_av = model_av([prob_audio, prob_video])
    end_time = time.time() - start_time
    all_time_end = time.time() - all_time
    print('Path: ', path)
    print('Predicted emotion: ', p_emo)
    print('Predicted phrase AV: ', label_phrase[np.argmax(prob_av[0])])
    print('Lead time video pre-processing: {} s'.format(np.round(end_pre_video, 2)))
    print('Lead time VER: {} s'.format(np.round(end_emo, 2)))
    print('Lead time VSR: {} s'.format(np.round(end_video, 2)))
    print('Lead time ASR: {} s'.format(np.round(end_audio, 2)))
    print('Lead time AVSR fusion: {} s'.format(np.round(end_time, 2)))
    print('Lead time AVSR (sum of stages): {} s'.format(np.round(end_emo + end_video + end_audio + end_time + end_pre_video, 2)))
    print('Lead time ALL: {} s'.format(np.round(all_time_end, 2)))
    print()
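# Batch driver: run the full per-video pipeline on every file found in --path_video.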
def pred_all_video():
    path_all_videos = os.listdir(args.path_video)
    for idx, cr_path in enumerate(tqdm(path_all_videos)):
        print('{}/{}'.format(idx + 1, len(path_all_videos)))
        pred_one_video(os.path.join(args.path_video, cr_path))
if __name__ == "__main__":
    pred_all_video()
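# Example invocation (the paths below are just the argparse defaults; adjust them to your layout):
#   python run.py --path_video ./test_video/ --conf_d 0.7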