-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathconvert_data.py
executable file
·166 lines (153 loc) · 5.37 KB
/
convert_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import numpy as np
import tensorflow as tf
import os
from shutil import copyfile, rmtree
# Data packaging: convert the raw image dataset into the layout Keras expects:
# each sub-folder represents one class and holds all images of that class.
'''
data
customs folder
classA
image1
image2
...
classB
image1
image2
...
classC
...
...
research
test
0
image1
image2
...
1
image1
image2
...
2
image1
image2
...
train
0
image1
image2
...
1
image1
image2
...
2
image1
image2
...
validation
0
image1
image2
...
1
image1
image2
...
...
'''
# Folder holding the original images (one sub-folder per class).
source_data_folder = "F://ai_data/camelyon17/train_data"

# Folder that receives the converted train/test/validation layout.
research_data_folder = "F://ai_data/camelyon17/research_data"

# Text file listing "<image path> <class index>", one image per line.
label_text_file = "{0}//labels.txt".format(source_data_folder)

# Number of images used for training.
train_num = 400000
# Number of images used for validation during training.
val_num = 941
# Number of images held out for the final test.
test_num = 4000
def convert_class_data():
    """Shuffle the labelled images and distribute them over the three splits.

    Reads every line of ``label_text_file``, shuffles the lines with a fixed
    NumPy seed, and then hands consecutive slices of the shuffled list to
    ``save_images`` for the "train", "test" and "validation" phases, in that
    order. The index returned by each ``save_images`` call is passed on as
    the starting index of the next call.
    """
    # Fixed seed so that every run produces the same shuffle.
    np.random.seed(0)

    # Read the label file produced earlier; the context manager guarantees
    # the handle is closed even if reading fails.
    with open(label_text_file) as label_file:
        labels = label_file.readlines()

    # Shuffle in place so the splits are drawn at random.
    np.random.shuffle(labels)

    current_i = 0
    splits = (
        ("train", train_num),
        ("test", test_num),
        ("validation", val_num),
    )
    for phase, d_size in splits:
        current_i = save_images(
            current_i=current_i, phase=phase, d_size=d_size, labels=labels
        )
def save_images(current_i, phase, d_size, labels, dst_root=None):
    """Copy one split's images into per-class folders and record their labels.

    Args:
        current_i: Index into ``labels`` of the first entry of this split.
        phase: Split name; one of "train", "test" or "validation".
        d_size: Number of label entries to consume for this split.
        labels: Sequence of lines shaped "<image path> <class name>\\n".
        dst_root: Root output folder. Defaults to the module-level
            ``research_data_folder`` when omitted.

    Returns:
        The index of the first label entry NOT consumed by this call, i.e.
        the value to pass as ``current_i`` for the next split. (Returning the
        last consumed index instead would make consecutive splits share one
        image.)

    Raises:
        SystemExit: If ``phase`` is not a known split name (after printing
            the error, as before).
        IndexError: If fewer than ``d_size`` entries remain in ``labels``;
            raised before anything is copied.
        ValueError: If a label line has no space-separated class name.
    """
    if phase not in ("train", "test", "validation"):
        print("phase error : {0}".format(phase))
        raise SystemExit

    if dst_root is None:
        dst_root = research_data_folder

    end_i = current_i + max(d_size, 0)
    # Fail fast instead of crashing after a partial copy.
    if end_i > len(labels):
        raise IndexError(
            "phase {0!r} needs labels[{1}:{2}] but only {3} labels exist".format(
                phase, current_i, end_i, len(labels)
            )
        )

    # The split folder must exist before the label file inside the root
    # can be opened on a fresh run.
    dst_folder = os.path.join(dst_root, phase)
    os.makedirs(dst_folder, exist_ok=True)

    # Remember which class folders were already created to avoid one
    # filesystem check per image.
    created_dirs = set()

    # Side file that records "<copied path> <class name>" for this split.
    label_path = os.path.join(dst_root, phase + "_label.txt")
    with open(label_path, mode="w") as label_file:
        for i in range(current_i, end_i):
            # Split on the LAST space only, so image paths that contain
            # spaces stay intact; drop the trailing line break first.
            parts = labels[i].rstrip("\r\n").rsplit(" ", 1)
            if len(parts) != 2:
                raise ValueError(
                    "malformed label line {0}: {1!r}".format(i, labels[i])
                )
            img_source_path, img_class_name = parts

            class_dir = os.path.join(dst_folder, img_class_name)
            if class_dir not in created_dirs:
                os.makedirs(class_dir, exist_ok=True)
                created_dirs.add(class_dir)

            img_dst_path = os.path.join(
                class_dir, os.path.basename(img_source_path)
            )
            copyfile(img_source_path, img_dst_path)
            print("{0} copied".format(img_dst_path))

            label_file.write(img_dst_path + " " + img_class_name + "\n")

    return end_i
def image_labeling(source_folder=None, label_path=None):
    """Write a label file listing every image with its class index.

    Every sub-folder of the source folder is treated as one class. Classes
    are numbered 0, 1, 2, ... in sorted folder-name order, and the files of
    each class are listed in sorted order as well, so the generated file is
    identical from run to run (``os.listdir`` alone gives no ordering
    guarantee). Each output line is "<image path> <class index>".

    Args:
        source_folder: Folder containing one sub-folder per class. Defaults
            to the module-level ``source_data_folder``.
        label_path: Path of the label file to write. Defaults to the
            module-level ``label_text_file``.
    """
    if source_folder is None:
        source_folder = source_data_folder
    if label_path is None:
        label_path = label_text_file

    # Collect the class folders; plain files in the root are ignored.
    directories = []
    for name in sorted(os.listdir(source_folder)):
        path = os.path.join(source_folder, name)
        if os.path.isdir(path):
            directories.append(path)

    # Build all lines first so an error while listing a folder does not
    # leave a truncated label file behind.
    lines = []
    for class_index, directory in enumerate(directories):
        for filename in sorted(os.listdir(directory)):
            image_path = os.path.join(directory, filename)
            lines.append(image_path + " " + str(class_index) + "\n")

    with open(label_path, mode="w") as label_file:
        for text in lines:
            print(text)
            label_file.write(text)
def main():
    """Run the full conversion: build the label file, then split the data."""
    print("Start to convert data")
    # The label file must exist before the data can be split.
    for step in (image_labeling, convert_class_data):
        step()
# Script entry point. The guard must appear exactly once: a second copy
# would run the whole conversion a second time.
if __name__ == '__main__':
    main()