-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhonours.py
More file actions
95 lines (75 loc) · 3.62 KB
/
honours.py
File metadata and controls
95 lines (75 loc) · 3.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import pandas as pd
import csv
# Input/output locations (Windows-style relative paths).
source_file_full_path = ".\\source\\NewYearHonoursList2019.csv"
output_file_full_path = ".\\output\\NewYearHonoursList2019_x.csv"
# Load the honours list; skiprows=1 drops a leading banner row above the
# header, and the file is Latin-1 encoded (accented names such as É).
df = pd.read_csv(source_file_full_path,encoding='iso-8859-1',skiprows=1)
# Post-nominal honorific suffixes to detect at the end of a name
# (orders, decorations, professional/judicial letters).
# NOTE(review): a few entries appear twice ("MVO", "MBE", "IOM", "FRS") —
# harmless, since word_finder reduces the list to a set, but could be deduped.
suffixes = [
"KG","LG","KT","LT","GCB","KCB","DCB","CB","GCMG","KCMG","DCMG","CMG","DSO","GCVO",
"KCVO","DCVO","CVO","LVO","MVO","OM","ISO","GBE","KBE","DBE","CBE","OBE","MBE","CH",
"VC","GC","KP","GCSI","GCIE","VA","CI","KCSI","KCIE","CSI","CIE","ISM","MVO","MBE",
"IOM","CGC","RRC","DSC","MC","DFC","AFC","ARRC","OBI","DCM","CGM","GM","IDSM","DSM",
"MM","DFM","AFM","SGM","IOM","CPM","QGM","RVM","BEM","KPM","KPFSM","QPM","QFSM","QAM",
"QVRM","MSM","ERD","VD","TD","UD","ED","RD","VRD","AE","QV","MNM","PC","ADC","ADC(P)",
"QHP","QHS","QHDS","QHNS","QHC","SCJ","J","LJ","P","DP","LC","CJ","MR","C","QS","SL",
"KC","QC","JP","DL","MP","MSP","AM","MLA","MHK","SHK","MLC","MEP","FRS","FBA","FRSE",
"AC","FRS","FRENG","ONZ"]
# Titles and styles to detect at the start of a name.
prefixes = [
"Hon","Rt","Professor","Prof","Dr","Councillor","Colonel","Reverend","Brig","Baroness",
"Lady","Sheriff","Imam","Archbishop","Very","Rev","The","Lord","Mr","Lt","Mrs","Ms",
"Major","Capt","Captain","Miss","Sir","Commander","Doctor","Dame","His","Honour",
"Bishop","Mufti","Col","(Retd)","(rtd)","Baron","Sister","Chief","Superintendent"
]
# Word search function
def word_finder(x,list_type):
    """Return the words of *x* that also appear in *list_type*.

    Parameters
    ----------
    x : str
        Space-separated name string to scan.
    list_type : list of str
        Vocabulary of honorific words (prefixes or suffixes) to match.

    Returns
    -------
    str
        The matching words joined by single spaces, deduplicated, in the
        order they occur in *x*.  The original set-intersection version
        returned them in arbitrary hash-randomised order, which made the
        Suffix_temp/Prefix_temp columns differ between runs; the joined
        string's *length* is unchanged, so downstream slicing that relies
        on it behaves exactly as before.
    """
    vocab = set(list_type)  # O(1) membership tests
    seen = set()            # dedupe while preserving first-occurrence order
    matches = []
    for word in x.split(' '):
        if word in vocab and word not in seen:
            seen.add(word)
            matches.append(word)
    return ' '.join(matches)
# --- Clean the raw name string -------------------------------------------
# Drop commas so honorific tokens match the bare words in the vocab lists.
# regex=False is now explicit: these are literal replacements.
df['New_name'] = df['Name'].str.replace(',', '', regex=False)
# Fix any weird spacings between hyphenated words ("Smith - Jones" -> "Smith-Jones").
df['New_name'] = df['New_name'].str.replace(' - ', '-', regex=False)
# --- Suffixes -------------------------------------------------------------
# Suffix_temp holds the honorific words found anywhere in the name; only its
# LENGTH is used, to trim that many characters off the end of New_name.
df['Suffix_temp'] = df.New_name.apply(word_finder,args=(suffixes,))
df['Suffix_len'] = df['Suffix_temp'].str.len()
df['New_len'] = df['New_name'].str.len() - df['Suffix_temp'].str.len()
# Get suffixes (the trailing slice of the cleaned name).
df['Suffix'] = df.apply(lambda x: x['New_name'][x['New_len']:],axis=1)
# Remove suffixes.
df['New_name'] = df.apply(lambda x: x['New_name'][:x['New_len']],axis=1)
# --- Prefixes -------------------------------------------------------------
df['Prefix_temp'] = df.New_name.apply(word_finder,args=(prefixes,))
df['Prefix_len'] = df['Prefix_temp'].str.len()
# Get prefixes (the leading slice of the cleaned name).
df['Prefix'] = df.apply(lambda x: x['New_name'][:x['Prefix_len']],axis=1)
# Remove prefixes.
df['New_name'] = df.apply(lambda x: x['New_name'][x['Prefix_len']:],axis=1)
# --- Alternative names (names in brackets) --------------------------------
# Raw string avoids invalid-escape warnings for \( and \).
df['Alt_name'] = df['New_name'].str.extract(r'.*\((.*)\).*')
df['Alt_name'] = df['Alt_name'].fillna('')
# Remove the bracketed part.  regex=True must be explicit here: pandas >= 2.0
# defaults str.replace to literal matching, which would silently stop
# stripping the brackets.
df['New_name'] = df['New_name'].str.replace(r"\(.*\)", "", regex=True)
df['New_name'] = df['New_name'].str.strip()
# --- Last names -----------------------------------------------------------
# Surnames in the source appear in upper case, so the trailing run of
# capitals/hyphens/apostrophes (plus É) is taken as the last name.
# NOTE(review): only É is allowed beyond ASCII — other accented capitals
# in the data would be missed; confirm against the source file.
df['LastName'] = df['New_name'].str.extract(r"([- A-Z'É]+$)")
# Remove the last name by length, as with the suffixes above.
df['Len'] = df['New_name'].str.len() - df['LastName'].str.len()
df['New_name'] = df.apply(lambda x: x['New_name'][:x['Len']],axis=1)
# --- First / middle names -------------------------------------------------
# Split on the first run of whitespace: word one -> FirstName, the rest ->
# MiddleNames (NaN when absent).  n must be passed by keyword: it is
# keyword-only in current pandas Series.str.split.
df2 = pd.DataFrame(df['New_name'].str.split(None, n=1).tolist(),
                   columns=['FirstName', 'MiddleNames'])
# Assemble the output table: the parsed name components plus the columns
# carried through unchanged from the source file.  Column order in the CSV
# follows the dict insertion order below.
df3 = pd.DataFrame({
    'Prefix': df['Prefix'],
    'FirstName': df2['FirstName'],
    'MiddleNames': df2['MiddleNames'].str.strip(),
    'LastName': df['LastName'].str.strip(),
    'Suffix': df['Suffix'],
    'AKA': df['Alt_name'],
    'OriginalName': df['Name'].str.strip(),
    'Order': df['Order'],
    'Level': df['Level'],
    'Award': df['Award'],
    'Citation': df['Citation'],
    'County': df['County'],
})
# Write the result; every non-numeric field is quoted so embedded commas
# survive the round trip, and the encoding matches the source file.
df3.to_csv(output_file_full_path, index=None, header=True,
           quoting=csv.QUOTE_NONNUMERIC, encoding='iso-8859-1')