-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathclean-data.py
70 lines (58 loc) · 1.95 KB
/
clean-data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# encoding: utf-8
#
# Dieses Script bereinigt die rohen Wahlergebnis-CSVs.
#
import csv
import glob
import os
def _decode(row):
    """Return *row* with every cell decoded from ISO-8859-1."""
    return [s.decode('iso-8859-1') for s in row]


def _encode(row):
    """Return *row* with every cell encoded as UTF-8."""
    return [s.encode('utf-8') for s in row]


def _fill_right(row):
    """Copy each header label into the empty cells to its right, in place.

    A cell that directly follows a 'Name' label is left empty. Because the
    row is filled left to right, one label spreads over a whole run of
    empty cells.
    """
    for c in range(1, len(row)):
        if row[c] == '' and row[c - 1] != 'Name':
            row[c] = row[c - 1]
    return row


def _flatten_header(header_rows):
    """Collapse the stacked header rows into one list of column names.

    For every column of the first header row, the non-empty labels of all
    header rows are joined with '.'; the label u'Endgültig' is left out.
    The first three columns are then named 'Nr', 'Wahlkreis' and 'Land'.
    """
    columns = []
    for c in range(len(header_rows[0])):
        parts = [
            h[c] for h in header_rows
            if len(h) > c and h[c] != '' and h[c] != u'Endgültig'
        ]
        columns.append('.'.join(parts))
    columns[0] = 'Nr'
    columns[1] = 'Wahlkreis'
    columns[2] = 'Land'
    return columns


def clean_data():
    """Clean every raw result file matching ``data/btw*_kerg.csv``.

    Each input file is read as semicolon-delimited ISO-8859-1 text and
    written as a tab-delimited UTF-8 file of the same name into
    ``data/cleaned/`` (the directory must already exist).

    Per file:

    * rows are skipped until one whose first cell is 'Nr' or 'Wahlkreis';
    * that row plus the next two rows (only one more row for
      ``btw94_kerg.csv``) form the header, which is filled to the right
      and flattened into a single row of dotted column names;
    * columns whose name contains 'Vorperiode' are dropped;
    * a data row is kept only if it has more than one cell, a non-empty
      first cell, and a first cell whose integer value is below 900.

    The file name and the number of kept columns are printed per file.

    NOTE: this relies on Python 2 semantics -- the csv module must yield
    and accept byte strings for the decode/encode steps to work.

    Raises:
        StopIteration: if a file ends before the header rows are read.
        ValueError: if a candidate data row has a non-numeric first cell.
        IndexError: if a data row has more cells than the header.
    """
    for f in glob.glob('data/btw*_kerg.csv'):
        print(f)
        # Work from the base name so the output path and the btw94 special
        # case do not depend on which path separator glob returned;
        # otherwise the output path could equal the input path.
        name = os.path.basename(f)
        f_out = os.path.join('data', 'cleaned', name)
        # Context managers make sure both files are flushed and closed.
        with open(f) as src, open(f_out, 'w') as dst:
            out = csv.writer(dst, dialect='excel-tab')
            data = csv.reader(src, delimiter=';')

            # Skip the preamble; blank lines come back as empty lists.
            row = next(data)
            while not row or row[0] not in ('Nr', 'Wahlkreis'):
                row = next(data)

            # Pull in the remaining header rows.
            header_s = [row, next(data)]
            if name != 'btw94_kerg.csv':
                header_s.append(next(data))
            header = [_fill_right(_decode(r)) for r in header_s]

            single_row_head = _flatten_header(header)
            # Decide once per file which columns survive.
            keep = ['Vorperiode' not in h for h in single_row_head]

            nrow = [h for h, wanted in zip(single_row_head, keep) if wanted]
            print(len(nrow))
            out.writerow(_encode(nrow))

            for row in data:
                row = _decode(row)
                # presumably numbers from 900 up are aggregate rows --
                # TODO confirm against the raw files
                if len(row) > 1 and row[0] != '' and int(row[0]) < 900:
                    nrow = [v for idx, v in enumerate(row) if keep[idx]]
                    out.writerow(_encode(nrow))
# Run the clean-up only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    clean_data()