forked from thuml/Time-Series-Library
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclean_data.py
115 lines (91 loc) · 3.95 KB
/
clean_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import csv
import os
import shutil
from colorama import init, Fore
from exp.exp_basic import Exp_Basic
from hyper_parameter_optimizer import basic_settings
init(autoreset=True)
root_path = '.'
data_dir = 'data'
print('Start cleaning data...')
# build basic experiment
exp_basic = Exp_Basic(root_path=root_path, args=None, try_model=True, save_process=True, initialize_later=True)
# get all root folders
checkpoints_folder = exp_basic.root_checkpoints_path
process_folder = exp_basic.root_process_path
results_folder = exp_basic.root_results_path
test_results_folder = exp_basic.root_test_results_path
m4_results_folder = exp_basic.root_m4_results_path
prob_results_folder = exp_basic.root_prob_results_path
print('Root folders:')
print(f"\tCheckpoints folder: {checkpoints_folder}")
print(f"\tProcess folder: {process_folder}")
print(f"\tResults folder: {results_folder}")
print(f"\tTest results folder: {test_results_folder}")
print(f"\tM4 results folder: {m4_results_folder}")
print(f"\tProb results folder: {prob_results_folder}")
# get data folder
data_folder = os.path.join(root_path, data_dir)
print('Data folder:')
print(f"\tData folder: {data_folder}")
# get fieldnames
fieldnames = basic_settings.get_fieldnames('all')
print('Fieldnames:')
print(f"\tFieldnames: {fieldnames}")
def clean_blank_folder(_folders):
# clean blank folder under root folder
clean_number = 0
blank_settings = []
for folder in _folders:
if os.path.exists(folder):
for path in os.listdir(folder):
setting_folder = os.path.join(folder, path)
if os.path.isdir(setting_folder) and not os.listdir(setting_folder):
os.rmdir(setting_folder)
if path not in blank_settings:
blank_settings.append(path)
clean_number += 1
if clean_number != 0:
print(Fore.RED + f"Cleaned {clean_number} blank folders")
for blank_setting in blank_settings:
print(Fore.RED + f"\t{blank_setting}")
clean_blank_folder([checkpoints_folder, process_folder, results_folder, test_results_folder, m4_results_folder,
prob_results_folder])
def clean_unrelated_folder(_folders):
# scan all csv files under data folder
file_paths = []
for root, dirs, files in os.walk(str(data_folder)):
for _file in files:
if _file.endswith('.csv') and _file not in file_paths:
_append_path = os.path.join(root, _file)
file_paths.append(_append_path)
print(f"Found {len(file_paths)} csv files under data folder")
# read csv files
setting_list = []
for file_path in file_paths:
with open(file_path, 'r') as csv_file:
reader = csv.DictReader(csv_file, fieldnames=fieldnames)
next(reader) # skip the header
for row in reader:
setting = row['setting']
if setting not in setting_list:
setting_list.append(setting)
print(f"Found {len(setting_list)} settings in all csv files")
# clean unrelated folders
clean_number = 0
unrelated_settings = []
for folder in _folders:
if os.path.exists(folder):
for path in os.listdir(folder):
setting_folder = os.path.join(folder, path)
if os.path.isdir(setting_folder) and path not in setting_list:
shutil.rmtree(setting_folder)
if path not in unrelated_settings:
unrelated_settings.append(path)
clean_number += 1
if clean_number != 0:
print(Fore.RED + f"Cleaned {clean_number} folders that are not experiments in csv files under data folder")
for unrelated_setting in unrelated_settings:
print(Fore.RED + f"\t{unrelated_setting}")
clean_unrelated_folder([checkpoints_folder, process_folder, results_folder, test_results_folder, m4_results_folder,
prob_results_folder])