-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathremove_duplicate.py
30 lines (24 loc) · 1.12 KB
/
remove_duplicate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import csv
def remove_duplicates(input_file_name, output_file_name):
    """Copy a '#'-delimited CSV, keeping only the first row per question.

    The input is read with a header row and must provide ``scenario`` and
    ``question`` columns. Both values are stripped of surrounding
    whitespace. Rows are compared on the stripped ``question`` text alone
    (the scenario is not part of the key); the first occurrence is kept and
    input order is preserved. The output file gets a header row followed by
    the ``scenario`` and ``question`` columns of the surviving rows.

    Args:
        input_file_name: Path of the UTF-8 CSV file to read.
        output_file_name: Path of the UTF-8 CSV file to create or overwrite.

    Raises:
        KeyError: If a data row is read and the header lacks a ``scenario``
            or ``question`` column.
        OSError: If either file cannot be opened.
    """
    seen_questions = set()
    unique_rows = []

    with open(input_file_name, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='#')
        for row in reader:
            # DictReader fills fields missing from a short row with None;
            # treat those as empty text instead of crashing on None.strip().
            # Indexing (not .get) keeps the KeyError for an absent column.
            scenario = (row['scenario'] or '').strip()
            question = (row['question'] or '').strip()

            # Only the first row carrying a given question is kept.
            if question in seen_questions:
                continue
            seen_questions.add(question)
            unique_rows.append({"scenario": scenario, "question": question})

    # Written only after the input is closed, so reading has fully finished
    # before the output file is opened.
    with open(output_file_name, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['scenario', 'question']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter='#')
        writer.writeheader()
        writer.writerows(unique_rows)
if __name__ == "__main__":
    # Script entry point: de-duplicate the combined file in the current
    # working directory into a separate output file.
    remove_duplicates(
        input_file_name="combined.csv",
        output_file_name="unique_output_data.csv",
    )