#!/bin/bash
# Script to perform manual testing of clusx cluster & clusx evaluate commands
# This script will:
# 1. Load test parameters from a profile file
# 2. Execute clusx cluster with these parameters
# 3. Move output files to organized folders
# 4. Create JSON with performance metrics
# 5. Create readme.txt with run information
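#
# Example invocation (paths and values below are illustrative only):
#   ./run_batch_test.sh --input data/questions.csv --profile profiles/smoke.profile \
#     --column question --random-seed 42 --batch-name smoke-test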
set -e # Exit on error
# Display usage information
usage() {
    echo "Usage: $0 --input <input_file> --profile <profile_file> [--column <column_name>] [--random-seed <seed>] [--output <output_basename>] [--output-dir <dir>] [--batch-name <name>]"
    echo ""
    echo "Arguments:"
    echo "  --input        Path to the input CSV file (required)"
    echo "  --profile      Path to the profile file with test parameters (required)"
    echo "  --column       Column name to use for clustering (default: 'question')"
    echo "  --random-seed  Random seed for reproducibility (default: random value)"
    echo "  --output       Base name for output files (default: 'clusters_output')"
    echo "  --output-dir   Directory to save output files (default: 'output')"
    echo "  --batch-name   Name of the batch (default: current date and time)"
    echo "  --help         Show this help message and exit"
    exit 1
}
# Parse command line arguments
INPUT_FILE=""
PROFILE_FILE=""
COLUMN="question"
RANDOM_SEED=$RANDOM # Default to a random value
OUTPUT_BASENAME="clusters_output"
OUTPUT_DIR="output"
BATCH_NAME=""
while [[ $# -gt 0 ]]; do
    case $1 in
        --input)
            INPUT_FILE="$2"
            shift 2
            ;;
        --profile)
            PROFILE_FILE="$2"
            shift 2
            ;;
        --column)
            COLUMN="$2"
            shift 2
            ;;
        --random-seed)
            RANDOM_SEED="$2"
            shift 2
            ;;
        --output)
            OUTPUT_BASENAME="$2"
            shift 2
            ;;
        --output-dir)
            OUTPUT_DIR="$2"
            shift 2
            ;;
        --batch-name)
            BATCH_NAME="$2"
            shift 2
            ;;
        --help)
            usage
            ;;
        *)
            echo "Unknown option: $1"
            usage
            ;;
    esac
done
# Check if input file is provided
if [ -z "$INPUT_FILE" ]; then
    echo "Error: Input file is required"
    usage
fi
# Check if profile file is provided
if [ -z "$PROFILE_FILE" ]; then
    echo "Error: Profile file is required"
    usage
fi
# Check if input file exists
if [ ! -f "$INPUT_FILE" ]; then
    echo "Error: Input file '$INPUT_FILE' does not exist"
    exit 1
fi
# Check if profile file exists
if [ ! -f "$PROFILE_FILE" ]; then
    echo "Error: Profile file '$PROFILE_FILE' does not exist"
    exit 1
fi
# Get current date and time for documentation and folder naming
CURRENT_DATE=$(date +"%Y-%m-%d")
CURRENT_DATETIME=$(date +"%Y%m%d-%H%M%S")
# If batch name is not provided, use the current date and time
if [ -z "$BATCH_NAME" ]; then
    BATCH_NAME="${CURRENT_DATETIME}"
fi
# Create batch directory structure
BATCH_DIR="${OUTPUT_DIR}/batch/${BATCH_NAME}"
mkdir -p "${BATCH_DIR}"
# Load test cases from profile file
# Strip empty lines and comments (lines starting with #)
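# Each remaining line is passed verbatim to "clusx cluster" as extra command-line
# options, one test case per line. A hypothetical profile line might look like:
#   --dp-alpha 1.0 --pyp-alpha 0.8 --pyp-sigma 0.5
# (flag names above are illustrative; use whatever options your clusx version accepts)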
declare -A test_cases
test_num=1
while IFS= read -r line || [[ -n "$line" ]]; do
    # Skip empty lines and comments
    if [[ -z "$line" || "$line" =~ ^[[:space:]]*# ]]; then
        continue
    fi
    # Store non-empty, non-comment lines as test cases
    test_cases[$test_num]="$line"
    ((test_num++))
done < "$PROFILE_FILE"
# Get the total number of test cases
total_tests=${#test_cases[@]}
if [ $total_tests -eq 0 ]; then
    echo "Error: No valid test cases found in profile file"
    exit 1
fi
echo "Starting test run with input file: $INPUT_FILE"
echo "Profile file: $PROFILE_FILE"
echo "Column: $COLUMN"
echo "Random seed: $RANDOM_SEED"
echo "Output basename: $OUTPUT_BASENAME"
echo "Output directory: $OUTPUT_DIR"
echo "Batch name: $BATCH_NAME"
echo "Number of test cases: $total_tests"
echo ""
# Create temporary directory for intermediate files
TEMP_DIR="${OUTPUT_DIR}/temp"
mkdir -p "${TEMP_DIR}"
# Run each test case
for test_num in $(seq 1 $total_tests); do
    echo "======================================================="
    echo "Running Test $test_num"
    echo "Parameters: ${test_cases[$test_num]}"
    echo "======================================================="
    # Extract parameters
    PARAMS=${test_cases[$test_num]}
    # Create run directory with proper structure
    RUN_DIR="${BATCH_DIR}/${test_num}"
    mkdir -p "${RUN_DIR}"
    # Dynamically build output file names
    DP_CLUSTERS_CSV="${TEMP_DIR}/${OUTPUT_BASENAME}_dp.csv"
    PYP_CLUSTERS_CSV="${TEMP_DIR}/${OUTPUT_BASENAME}_pyp.csv"
    DP_CLUSTERS_JSON="${TEMP_DIR}/${OUTPUT_BASENAME}_dp.json"
    PYP_CLUSTERS_JSON="${TEMP_DIR}/${OUTPUT_BASENAME}_pyp.json"
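    # NOTE: these paths assume that "clusx cluster" writes <basename>_dp.* and
    # <basename>_pyp.* files into --output-dir; if your clusx version names its
    # outputs differently, adjust the four variables above to match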
    # Run clusx cluster command
    echo "Executing: clusx cluster --input $INPUT_FILE --column $COLUMN $PARAMS --random-seed $RANDOM_SEED --output $OUTPUT_BASENAME.csv --output-dir $TEMP_DIR"
    clusx cluster --input "$INPUT_FILE" --column "$COLUMN" $PARAMS --random-seed "$RANDOM_SEED" --output "$OUTPUT_BASENAME.csv" --output-dir "$TEMP_DIR"
    # Run clusx evaluate command with proper parameters
    echo "Executing: clusx evaluate --input $INPUT_FILE --column $COLUMN --dp-clusters $DP_CLUSTERS_CSV --pyp-clusters $PYP_CLUSTERS_CSV --random-seed $RANDOM_SEED --plot --output-dir $TEMP_DIR"
    clusx evaluate --input "$INPUT_FILE" --column "$COLUMN" --dp-clusters "$DP_CLUSTERS_CSV" --pyp-clusters "$PYP_CLUSTERS_CSV" --random-seed "$RANDOM_SEED" --plot --output-dir "$TEMP_DIR"
    # Move output files to run directory
    echo "Moving output files to $RUN_DIR"
    find "${TEMP_DIR}" -type f \( -name "*.json" -o -name "*.csv" -o -name "*.png" \) -exec mv {} "${RUN_DIR}/" \;
    # Extract stats and create performance metrics JSON
    echo "Extracting performance metrics..."
    # Check if evaluation_report.json exists
    if [ -f "${RUN_DIR}/evaluation_report.json" ]; then
        # Extract Dirichlet stats
        d_num_clusters=$(jq '.Dirichlet.cluster_stats.num_clusters' "${RUN_DIR}/evaluation_report.json")
        # Extract cluster size distribution for Dirichlet - count clusters by their sizes
        d_size_1=$(jq '.Dirichlet.cluster_stats.cluster_sizes | to_entries | map(select(.value == 1)) | length' "${RUN_DIR}/evaluation_report.json")
        d_size_2_5=$(jq '.Dirichlet.cluster_stats.cluster_sizes | to_entries | map(select(.value >= 2 and .value <= 5)) | length' "${RUN_DIR}/evaluation_report.json")
        d_size_6plus=$(jq '.Dirichlet.cluster_stats.cluster_sizes | to_entries | map(select(.value >= 6)) | length' "${RUN_DIR}/evaluation_report.json")
        # Extract powerlaw stats for Dirichlet
        d_powerlaw_alpha=$(jq '.Dirichlet.metrics.powerlaw.alpha // 0' "${RUN_DIR}/evaluation_report.json")
        d_is_powerlaw=$(jq '.Dirichlet.metrics.powerlaw.is_powerlaw // false' "${RUN_DIR}/evaluation_report.json")
        # Extract similarity stats for Dirichlet
        d_silhouette=$(jq '.Dirichlet.metrics.silhouette_score // 0' "${RUN_DIR}/evaluation_report.json")
        d_intra_sim=$(jq '.Dirichlet.metrics.similarity.intra_cluster_similarity // 0' "${RUN_DIR}/evaluation_report.json")
        d_inter_sim=$(jq '.Dirichlet.metrics.similarity.inter_cluster_similarity // 0' "${RUN_DIR}/evaluation_report.json")
        d_silhouette_like=$(jq '.Dirichlet.metrics.similarity.silhouette_like_score // 0' "${RUN_DIR}/evaluation_report.json")
        # Extract Pitman-Yor stats
        py_num_clusters=$(jq '."Pitman-Yor".cluster_stats.num_clusters' "${RUN_DIR}/evaluation_report.json")
        # Extract cluster size distribution for Pitman-Yor - count clusters by their sizes
        py_size_1=$(jq '."Pitman-Yor".cluster_stats.cluster_sizes | to_entries | map(select(.value == 1)) | length' "${RUN_DIR}/evaluation_report.json")
        py_size_2_5=$(jq '."Pitman-Yor".cluster_stats.cluster_sizes | to_entries | map(select(.value >= 2 and .value <= 5)) | length' "${RUN_DIR}/evaluation_report.json")
        py_size_6plus=$(jq '."Pitman-Yor".cluster_stats.cluster_sizes | to_entries | map(select(.value >= 6)) | length' "${RUN_DIR}/evaluation_report.json")
        # Extract powerlaw stats for Pitman-Yor
        py_powerlaw_alpha=$(jq '."Pitman-Yor".metrics.powerlaw.alpha // 0' "${RUN_DIR}/evaluation_report.json")
        py_is_powerlaw=$(jq '."Pitman-Yor".metrics.powerlaw.is_powerlaw // false' "${RUN_DIR}/evaluation_report.json")
        # Extract similarity stats for Pitman-Yor
        py_silhouette=$(jq '."Pitman-Yor".metrics.silhouette_score // 0' "${RUN_DIR}/evaluation_report.json")
        py_intra_sim=$(jq '."Pitman-Yor".metrics.similarity.intra_cluster_similarity // 0' "${RUN_DIR}/evaluation_report.json")
        py_inter_sim=$(jq '."Pitman-Yor".metrics.similarity.inter_cluster_similarity // 0' "${RUN_DIR}/evaluation_report.json")
        py_silhouette_like=$(jq '."Pitman-Yor".metrics.similarity.silhouette_like_score // 0' "${RUN_DIR}/evaluation_report.json")
        # Create performance metrics JSON
        cat > "${RUN_DIR}/performance_metrics.json" << EOF
{
  "Dirichlet": {
    "num_clusters": $d_num_clusters,
    "cluster_size_distribution": {"1": $d_size_1, "2-5": $d_size_2_5, "6+": $d_size_6plus},
    "powerlaw": {"alpha": $d_powerlaw_alpha, "is_powerlaw": $d_is_powerlaw},
    "silhouette_score": $d_silhouette,
    "similarity": {
      "intra": $d_intra_sim,
      "inter": $d_inter_sim,
      "silhouette_like": $d_silhouette_like
    }
  },
  "Pitman-Yor": {
    "num_clusters": $py_num_clusters,
    "cluster_size_distribution": {"1": $py_size_1, "2-5": $py_size_2_5, "6+": $py_size_6plus},
    "powerlaw": {"alpha": $py_powerlaw_alpha, "is_powerlaw": $py_is_powerlaw},
    "silhouette_score": $py_silhouette,
    "similarity": {
      "intra": $py_intra_sim,
      "inter": $py_inter_sim,
      "silhouette_like": $py_silhouette_like
    }
  }
}
EOF
    else
        echo "ERROR: evaluation_report.json not found in ${RUN_DIR}" >&2
        echo "Skipping test case $test_num and continuing with the next one." >&2
        cat > "${RUN_DIR}/performance_metrics.json" << EOF
{
  "error": "evaluation_report.json not found for this test case"
}
EOF
        continue
    fi
    # Create readme.txt with run information and performance metrics
    cat > "${RUN_DIR}/readme.txt" << EOF
Batch: $BATCH_NAME
Test Run: $test_num
Date: $CURRENT_DATE
Input File: $INPUT_FILE
Profile File: $PROFILE_FILE
Column: $COLUMN
Random Seed: $RANDOM_SEED
Output Basename: $OUTPUT_BASENAME
Parameters: ${test_cases[$test_num]}
Cluster Command:
clusx cluster --input $INPUT_FILE --column $COLUMN $PARAMS --random-seed $RANDOM_SEED --output $OUTPUT_BASENAME.csv --output-dir $TEMP_DIR
Evaluation Command:
clusx evaluate --input $INPUT_FILE --column $COLUMN --dp-clusters $DP_CLUSTERS_CSV --pyp-clusters $PYP_CLUSTERS_CSV --random-seed $RANDOM_SEED --plot --output-dir $TEMP_DIR
Performance Metrics:
$(jq -r . "${RUN_DIR}/performance_metrics.json")
EOF
    echo "Test $test_num completed. Results saved to ${RUN_DIR}"
    echo ""
done
# Clean up temporary directory
rm -rf "${TEMP_DIR}"
# Create a summary report
echo "Creating summary report..."
SUMMARY_FILE="${BATCH_DIR}/summary.md"
SUMMARY_CSV="${BATCH_DIR}/summary.csv"
cat > "${SUMMARY_FILE}" << EOF
# Clusx Test Summary
Batch: $BATCH_NAME
Date: $CURRENT_DATE
Input File: $INPUT_FILE
Profile File: $PROFILE_FILE
Column: $COLUMN
Random Seed: $RANDOM_SEED
Output Basename: $OUTPUT_BASENAME
## Test Cases
EOF
# Create CSV header
echo "test_num,model,dp_alpha,pyp_alpha,pyp_sigma,variance,num_clusters,cluster_size_1,cluster_size_2_5,cluster_size_6plus,powerlaw_alpha,is_powerlaw,silhouette_score,intra_similarity,inter_similarity,silhouette_like" > "${SUMMARY_CSV}"
for test_num in $(seq 1 $total_tests); do
    RUN_DIR="${BATCH_DIR}/${test_num}"
    # Add to markdown summary
    cat >> "${SUMMARY_FILE}" << EOF
### Test $test_num
Parameters: ${test_cases[$test_num]}
\`\`\`json
$(cat "${RUN_DIR}/performance_metrics.json")
\`\`\`
EOF
    # Extract metrics from performance_metrics.json
    # Dirichlet model
    d_num_clusters=$(jq '.Dirichlet.num_clusters' "${RUN_DIR}/performance_metrics.json")
    d_size_1=$(jq '.Dirichlet.cluster_size_distribution."1"' "${RUN_DIR}/performance_metrics.json")
    d_size_2_5=$(jq '.Dirichlet.cluster_size_distribution."2-5"' "${RUN_DIR}/performance_metrics.json")
    d_size_6plus=$(jq '.Dirichlet.cluster_size_distribution."6+"' "${RUN_DIR}/performance_metrics.json")
    d_powerlaw_alpha=$(jq '.Dirichlet.powerlaw.alpha' "${RUN_DIR}/performance_metrics.json")
    d_is_powerlaw=$(jq '.Dirichlet.powerlaw.is_powerlaw' "${RUN_DIR}/performance_metrics.json")
    d_silhouette=$(jq '.Dirichlet.silhouette_score' "${RUN_DIR}/performance_metrics.json")
    d_intra_sim=$(jq '.Dirichlet.similarity.intra' "${RUN_DIR}/performance_metrics.json")
    d_inter_sim=$(jq '.Dirichlet.similarity.inter' "${RUN_DIR}/performance_metrics.json")
    d_silhouette_like=$(jq '.Dirichlet.similarity.silhouette_like' "${RUN_DIR}/performance_metrics.json")
    # Pitman-Yor model
    py_num_clusters=$(jq '."Pitman-Yor".num_clusters' "${RUN_DIR}/performance_metrics.json")
    py_size_1=$(jq '."Pitman-Yor".cluster_size_distribution."1"' "${RUN_DIR}/performance_metrics.json")
    py_size_2_5=$(jq '."Pitman-Yor".cluster_size_distribution."2-5"' "${RUN_DIR}/performance_metrics.json")
    py_size_6plus=$(jq '."Pitman-Yor".cluster_size_distribution."6+"' "${RUN_DIR}/performance_metrics.json")
    py_powerlaw_alpha=$(jq '."Pitman-Yor".powerlaw.alpha' "${RUN_DIR}/performance_metrics.json")
    py_is_powerlaw=$(jq '."Pitman-Yor".powerlaw.is_powerlaw' "${RUN_DIR}/performance_metrics.json")
    py_silhouette=$(jq '."Pitman-Yor".silhouette_score' "${RUN_DIR}/performance_metrics.json")
    py_intra_sim=$(jq '."Pitman-Yor".similarity.intra' "${RUN_DIR}/performance_metrics.json")
    py_inter_sim=$(jq '."Pitman-Yor".similarity.inter' "${RUN_DIR}/performance_metrics.json")
    py_silhouette_like=$(jq '."Pitman-Yor".similarity.silhouette_like' "${RUN_DIR}/performance_metrics.json")
    # Extract parameters from evaluation_report.json if it exists
    if [ -f "${RUN_DIR}/evaluation_report.json" ]; then
        dp_alpha=$(jq '.Dirichlet.parameters.alpha' "${RUN_DIR}/evaluation_report.json")
        pyp_alpha=$(jq '."Pitman-Yor".parameters.alpha' "${RUN_DIR}/evaluation_report.json")
        pyp_sigma=$(jq '."Pitman-Yor".parameters.sigma' "${RUN_DIR}/evaluation_report.json")
        variance=$(jq '.Dirichlet.parameters.variance' "${RUN_DIR}/evaluation_report.json")
    else
        # Reached when a test case was skipped earlier because evaluation_report.json
        # was missing; leave that test case out of the summary CSV as well
        echo "ERROR: evaluation_report.json not found in ${RUN_DIR}" >&2
        echo "Skipping test case $test_num and continuing with the next one." >&2
        continue
    fi
    # Add Dirichlet row to CSV
    echo "$test_num,Dirichlet,$dp_alpha,$pyp_alpha,$pyp_sigma,$variance,$d_num_clusters,$d_size_1,$d_size_2_5,$d_size_6plus,$d_powerlaw_alpha,$d_is_powerlaw,$d_silhouette,$d_intra_sim,$d_inter_sim,$d_silhouette_like" >> "${SUMMARY_CSV}"
    # Add Pitman-Yor row to CSV
    echo "$test_num,Pitman-Yor,$dp_alpha,$pyp_alpha,$pyp_sigma,$variance,$py_num_clusters,$py_size_1,$py_size_2_5,$py_size_6plus,$py_powerlaw_alpha,$py_is_powerlaw,$py_silhouette,$py_intra_sim,$py_inter_sim,$py_silhouette_like" >> "${SUMMARY_CSV}"
done
echo "Testing completed. Summary available at ${SUMMARY_FILE} and ${SUMMARY_CSV}"
echo "Results stored in ${BATCH_DIR}"