1
+ import streamlit as st
2
+ import pandas as pd
3
+ import plotly .express as px
4
+ import plotly .graph_objects as go
5
+ import pycountry
6
+ import numpy as np
7
+ from scipy .stats import norm
8
+ import random
9
+ from scipy .stats import norm
10
+ import re
11
+
12
# Per-year survey extracts used by the plots below.
df2018 = pd.read_csv('df2018.csv')
df2019 = pd.read_csv('df2019.csv')
data = pd.read_csv('df2020.csv')

# Full public survey result files for 2018-2020.
full_data2018 = pd.read_csv('../Data/survey_results_public_2018.csv')
full_data2019 = pd.read_csv('../Data/survey_results_public_2019.csv')
full_df2020 = pd.read_csv('../Data/survey_results_public_2020.csv')

# 2020 extract restricted to rows whose SalaryUSD is below 200000.
df2020 = data.loc[data['SalaryUSD'] < 200000]

# features for job satisfaction
results = pd.read_csv("results.csv")
22
+
23
+
24
+ #######################################
25
+ # VISUALISATION STARTS
26
+ #######################################
27
+
28
+ ######-Nikita-########
29
+
30
def plot_boxplot(data, x, y, title):
    """Draw one box trace per group of ``x`` and render it with Streamlit.

    Args:
        data: DataFrame containing the columns named by ``x`` and ``y``.
        x: Column to group by; also used as the x-axis title.
        y: Column whose values feed each box; also used as the y-axis title.
        title: Chart title.
    """
    box_traces = [
        go.Box(y=members[y], name=label)
        for label, members in data.groupby(x)
    ]

    fig = go.Figure()
    for trace in box_traces:
        fig.add_trace(trace)

    fig.update_layout(title=title, xaxis_title=x, yaxis_title=y)
    st.plotly_chart(fig)
36
+
37
+ #########################################################################
38
+
39
def plot_bar_plotly(df, column_name, top_n=10, height=450, width=700):
    """Render a bar chart of the ``top_n`` most frequent values of a column.

    Args:
        df: DataFrame containing ``column_name``.
        column_name: Column whose value frequencies are plotted.
        top_n: Number of most frequent values to keep.
        height: Figure height passed to the layout.
        width: Figure width passed to the layout.

    Returns:
        Whatever ``st.plotly_chart`` returns for the rendered figure.
    """
    most_common = df[column_name].value_counts().head(top_n)
    tally = most_common.reset_index()
    tally.columns = [column_name, 'Count']

    axis_labels = {column_name: column_name, 'Count': 'Number of Developers'}
    fig = px.bar(
        tally,
        x=column_name,
        y='Count',
        labels=axis_labels,
        color=column_name,
        color_discrete_sequence=px.colors.qualitative.Pastel,
    )
    fig.update_layout(
        xaxis_title=column_name,
        yaxis_title='Number of Developers',
        height=height,
        width=width,
    )

    rendered = st.plotly_chart(fig)
    return rendered
51
+
52
+
53
def plot_pie_plotly(df, column_name, top_n=10, height=400, width=400):
    """Render a pie chart of the ``top_n`` most frequent values of a column.

    Args:
        df: DataFrame containing ``column_name``.
        column_name: Column whose value frequencies are plotted.
        top_n: Number of most frequent values to include as slices.
        height: Figure height passed to the layout.
        width: Figure width passed to the layout.
    """
    tally = df[column_name].value_counts()
    slice_labels = tally.keys().tolist()[:top_n]
    slice_sizes = tally.tolist()[:top_n]

    pie = go.Pie(labels=slice_labels, values=slice_sizes)
    fig_pie = go.Figure(data=[pie])
    fig_pie.update_layout(
        title='Top {} Distribution'.format(column_name),
        height=height,
        width=width,
    )

    st.plotly_chart(fig_pie)
62
+
63
def plot_value_counts_plotly(df, column_name):
    """Build a bar figure of the value counts of ``column_name``.

    The bar colour is picked at random from a fixed pastel palette.

    NOTE(review): a later definition in this file reuses this name with the
    signature ``(column_name, df, column)`` and replaces this one once the
    module has finished loading.

    Args:
        df: DataFrame containing ``column_name``.
        column_name: Column whose value counts are plotted.

    Returns:
        The Plotly figure (it is not rendered here).
    """
    palette = ['lightseagreen', 'lightgreen', 'lightyellow', 'lightcoral', 'lightsalmon', 'lavender']

    tally = df[column_name].value_counts()
    bars = go.Bar(x=tally.index, y=tally.values, marker_color=random.choice(palette))
    fig = go.Figure(bars)
    fig.update_layout(
        title=f'Value Counts for {column_name} ',
        xaxis_title='Response',
        yaxis_title='Count',
    )
    return fig
70
+
71
+
72
def generate_normal_distribution_plots(df, column, top_n=10):
    """Yield one fitted normal-curve figure per most frequent value of ``column``.

    For each of the ``top_n`` most frequent values, the ``SalaryUSD`` rows of
    that group are summarised by mean and standard deviation, and the matching
    normal density is drawn between the group's minimum and maximum salary.
    A dashed red vertical line marks the mean. Figures are yielded, not
    rendered, so the caller decides where to place them.

    Args:
        df: DataFrame containing ``column`` and ``SalaryUSD``.
        column: Grouping column (the titles refer to its values as countries).
        top_n: Number of most frequent groups to plot.

    Yields:
        A Plotly figure for each selected group.
    """
    frequency = df[column].value_counts().sort_values(ascending=False)

    for country in frequency[:top_n].index.tolist():
        salaries = df.loc[df[column] == country, 'SalaryUSD']
        mu = salaries.mean()
        sigma = salaries.std()

        # Normal density evaluated on 100 evenly spaced salary values.
        grid = np.linspace(salaries.min(), salaries.max(), 100)
        density = norm.pdf(grid, mu, sigma)
        fig = go.Figure(data=go.Scatter(x=grid, y=density))

        # Vertical mean marker reaching up to the curve's value at the mean.
        fig.add_shape(
            type="line",
            x0=mu,
            y0=0,
            x1=mu,
            y1=norm.pdf(mu, mu, sigma),
            line=dict(color="red", width=2, dash="dash"),
        )

        fig.update_layout(
            title='Normal Distribution of Annual Salaries in {}'.format(country),
            xaxis_title="Annual Salary in USD",
            yaxis_title="Density",
            height=400,
            width=370,
        )

        yield fig
97
+
98
+
99
def _age_range_labels(ages):
    """Return the age-band label for every value in ``ages``.

    Bands are half-open (``lower <= age < upper``) so that fractional ages
    such as 19.5 or 45.5 fall into the band of their whole-year age instead
    of slipping through the gap between two integer-bounded bands.
    Values below 15 and missing values get ``'Age_unknown'``.
    """
    bands = [
        (15, 20, '15 - 19 years'),
        (20, 25, '20 - 24 years'),
        (25, 30, '25 - 29 years'),
        (30, 35, '30 - 34 years'),
        (35, 40, '35 - 39 years'),
        (40, 46, '40 - 45 years'),
    ]
    conditions = [(ages >= lower) & (ages < upper) for lower, upper, _ in bands]
    labels = [label for _, _, label in bands]

    # Open-ended top band.
    conditions.append(ages >= 46)
    labels.append('46 and above years')

    return np.select(conditions, labels, default='Age_unknown')


def plot_age_distribution(df, column_name):
    """Render a horizontal bar chart of respondent counts per age band.

    Side effect: an ``Age_range`` column holding each row's band label is
    written to ``df`` (kept from the original behaviour).

    Args:
        df: DataFrame containing the numeric age column ``column_name``.
        column_name: Name of the age column.
    """
    df['Age_range'] = _age_range_labels(df[column_name])

    df_age = df.groupby(['Age_range']).size().reset_index(name='Count')
    df_age = df_age.sort_values(by=['Count'], ascending=False)

    fig = go.Figure(data=go.Bar(
        x=df_age['Count'],
        y=df_age['Age_range'],
        orientation='h',
    ))

    # Reversed y-axis puts the first (largest) row at the top.
    fig.update_layout(
        xaxis_title='Count',
        yaxis_title='Age Range',
        yaxis=dict(autorange="reversed"),
    )

    st.plotly_chart(fig)
126
+
127
def counts(df, column_name, year):
    """Count how often each ';'-separated entry occurs in a column.

    Args:
        df: DataFrame containing the string column ``column_name``.
        column_name: Column whose cells hold ';'-separated entries.
        year: Name given to the count column in the result.

    Returns:
        DataFrame with two columns, ``column_name`` (the distinct entries)
        and ``year`` (their counts), ordered by ``value_counts`` and with a
        fresh 0..n-1 index.
    """
    # One row per individual entry; padding from shorter rows is dropped.
    entries = df[column_name].str.split(';', expand=True).stack()

    tally = entries.value_counts().to_frame(name=year)
    tally[column_name] = tally.index
    tally = tally.reset_index(drop=True)

    return tally[[column_name, year]]
133
+
134
def compare_column_and_plot(column):
    """Compare the share of each entry of ``column`` across 2018, 2019 and 2020.

    Entry counts are computed per year with ``counts`` from the module-level
    frames ``df2018``, ``df2019`` and ``df2020``, outer-merged on the entry,
    and each year column is divided by its own total. The first five rows of
    the resulting table are written to the page, followed by a grouped bar
    chart with one bar series per year.

    Args:
        column: Name of the ';'-separated column to compare; also used as the
            x-axis title.
    """
    years = ('2018', '2019', '2020')
    counts_2018 = counts(df2018, column, '2018')
    counts_2019 = counts(df2019, column, '2019')
    counts_2020 = counts(df2020, column, '2020')

    # Merge the counts of all three years; entries missing in a year become NaN.
    merged = pd.merge(counts_2018, counts_2019, on=column, how='outer')
    merged = pd.merge(merged, counts_2020, on=column, how='outer')

    # Fill NaN values with 0 and convert counts to integers.
    merged = merged.fillna(0)
    for year in years:
        merged[year] = merged[year].astype(int)

    merged = merged.set_index(column)

    # Each year column divided by its own sum.
    shares = merged.div(merged.sum())

    st.write(shares.head(5))
    fig = go.Figure()

    # The loop variable must not reuse the name ``column``: doing so replaced
    # the parameter and made the x-axis title show the last year instead of
    # the compared column.
    for year in shares.columns:
        fig.add_trace(go.Bar(x=shares.index, y=shares[year], name=year))

    fig.update_layout(
        xaxis_title=column,
        yaxis_title='Percentages',
        font=dict(size=14),
        barmode='group',
        height=600,
        width=800,
    )

    st.plotly_chart(fig)
172
+
173
def generate_choropleth(df, column_name):
    """Render a world choropleth of respondent counts per country.

    Rows of ``df`` are counted per ``Country`` into a ``Respondents`` column,
    and each country name is resolved to an ISO alpha-3 code with pycountry
    (``None`` when the lookup fails).

    NOTE(review): ``column_name`` is passed as the ``color`` column of the
    grouped frame, which only has ``Country``, ``Respondents`` and
    ``Country_code`` — callers presumably pass ``'Respondents'``; confirm.

    Args:
        df: DataFrame containing a ``Country`` column.
        column_name: Column of the grouped frame used for colouring.

    Returns:
        Whatever ``st.plotly_chart`` returns for the rendered figure.
    """
    def _alpha3(country_name):
        # ISO country code from the country name; None if pycountry has no match.
        try:
            match = pycountry.countries.lookup(country_name)
        except LookupError:
            return None
        return match.alpha_3

    per_country = df.groupby('Country').size().reset_index(name='Respondents')
    per_country['Country_code'] = per_country['Country'].apply(_alpha3)

    fig = px.choropleth(
        per_country,
        locations="Country_code",
        color=column_name,
        hover_name="Country",
        projection="natural earth",
        color_continuous_scale='Peach',
        range_color=[0, 10000],
        labels={column_name: 'Respondents'},
    )
    fig.update_layout(height=600, width=900)

    rendered = st.plotly_chart(fig)
    return rendered
198
+
199
def gender_vs_top5countries(df):
    """Build a grouped bar figure of gender shares in the five largest countries.

    Rows are counted per (Country, Gender) pair and each count is expressed
    as a percentage of its country's total. The five countries with the
    largest totals are kept, and the 'Man' and 'Woman' rows are plotted as
    two bar series.

    Args:
        df: DataFrame containing ``Country`` and ``Gender`` columns.

    Returns:
        The Plotly figure (it is not rendered here).
    """
    breakdown = df.groupby(['Country', 'Gender']).size().reset_index(name='Count')
    breakdown['Total'] = breakdown.groupby('Country')['Count'].transform('sum')
    breakdown['Percentage'] = breakdown['Count'] / breakdown['Total'] * 100

    largest = breakdown.groupby('Country')['Total'].max().nlargest(5).index
    top_rows = breakdown[breakdown['Country'].isin(largest)]

    # (value in the Gender column, legend name, bar colour)
    series_specs = (
        ('Man', 'Men', 'darkblue'),
        ('Woman', 'Women', '#5E96E9'),
    )

    fig = go.Figure()
    for gender, legend_name, colour in series_specs:
        subset = top_rows[top_rows['Gender'] == gender]
        fig.add_trace(go.Bar(
            x=subset['Country'],
            y=subset['Percentage'],
            name=legend_name,
            marker_color=colour,
        ))

    fig.update_layout(
        title='Gender vs Top 5 Countries in 2019',
        xaxis_title='Top 5 Countries',
        yaxis_title='Percentage',
        barmode='group',
    )

    return fig
228
def heighest_paying_2019():
    """Render the top-10 data-scientist salary chart for the 2019 frame.

    Delegates to ``heighest_paying`` with the module-level ``df2019`` and the
    2019-specific title, so both charts share a single implementation.
    """
    heighest_paying(
        df2019,
        title='The Top 10 highest paying data scientist countries in 2019',
    )


def heighest_paying(df, title='The Top 10 highest paying data scientist countries '):
    """Render the ten countries with the highest mean data-scientist salary.

    Rows whose ``DevType`` contains 'Data scientist' are averaged per
    country. Countries with a mean above 280000 are discarded, and the ten
    highest remaining means are drawn as a horizontal bar chart.

    Args:
        df: DataFrame containing ``DevType``, ``Country`` and ``SalaryUSD``.
        title: Chart title; the default is the original generic title.
    """
    # na=False treats a missing DevType as "not a data scientist", which is
    # what the former ``== True`` comparison achieved.
    is_data_scientist = df['DevType'].str.contains('Data scientist', na=False)
    data_scientists = df[is_data_scientist]

    mean_salary = (
        data_scientists.groupby('Country')['SalaryUSD']
        .mean()
        .reset_index(name='Mean')
        .sort_values(by=['Mean'], ascending=False)
    )
    mean_salary = mean_salary[mean_salary['Mean'] <= 280000]
    top_mean_salary = mean_salary[:10]

    fig = px.bar(
        top_mean_salary,
        x='Mean',
        y='Country',
        orientation='h',
        labels={'Mean': 'Average Salary in US$', 'Country': 'Country'},
        title=title,
    )
    fig.update_layout(
        yaxis={'categoryorder': 'total ascending'},
        title={'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
    )
    st.plotly_chart(fig)
256
def plot_value_counts_plotly(column_name, df, column):
    """Render a bar chart of a column's value counts into a Streamlit container.

    The bar colour is picked at random from a fixed pastel palette.

    NOTE(review): an earlier definition in this file uses the same name with
    the signature ``(df, column_name)``; this definition replaces it once the
    module has finished loading.

    Args:
        column_name: Column of ``df`` whose value counts are plotted.
        df: DataFrame containing ``column_name``.
        column: Object exposing ``plotly_chart`` (e.g. a Streamlit column)
            that receives the figure.
    """
    palette = ['lightseagreen', 'lightgreen', 'lightyellow', 'lightcoral', 'lightsalmon', 'lavender']

    tally = df[column_name].value_counts()
    bars = go.Bar(x=tally.index, y=tally.values, marker_color=random.choice(palette))

    fig = go.Figure(data=[bars])
    fig.update_layout(
        title=f'Value Counts for {column_name} ',
        xaxis_title='Response',
        yaxis_title='Count',
    )
    column.plotly_chart(fig)
261
+
262
def ai_graphs():
    """Render value-count charts for the four AI questions of the 2018 survey.

    The AI columns are taken from the module-level ``full_data2018``, string
    answers are stripped of surrounding whitespace, long answer texts are
    replaced by short labels, and the four charts are laid out in two
    Streamlit columns (two charts each).
    """
    st.title('AI Survey Responses')

    ai_columns = ['AIDangerous', 'AIInteresting', 'AIResponsible', 'AIFuture']
    answers = full_data2018[ai_columns]
    answers = answers.applymap(
        lambda cell: cell.strip() if isinstance(cell, str) else cell
    )

    # Long survey answer text -> short label shown on the chart axis.
    short_mapping = {
        'Algorithms making important decisions': 'Algorithms',
        'Artificial intelligence surpassing human intelligence ("the singularity")': 'AI Singularity',
        'Evolving definitions of "fairness" in algorithmic versus human decisions': 'Fairness Evolution',
        "Increasing automation of jobs": 'Automation',
        "The developers or the people creating the AI": 'Developers',
        "A governmental or other regulatory body": 'Government/Regulatory',
        "Prominent industry leaders": 'Industry Leaders',
        "Nobody": 'No Responsibility',
        "I'm excited about the possibilities more than worried about the dangers.": 'Excited about AI Future',
        "I'm worried about the dangers more than I'm excited about the possibilities.": 'Worried about AI Future',
        "I don't care about it, or I haven't thought about it.": 'Indifferent about AI Future',
    }
    answers = answers.replace(short_mapping)

    left, right = st.columns(2)

    # Same rendering order as before: two charts on the left, then two on the right.
    placements = (
        ('AIDangerous', left),
        ('AIInteresting', left),
        ('AIResponsible', right),
        ('AIFuture', right),
    )
    for question, container in placements:
        plot_value_counts_plotly(question, answers, container)
290
+
291
+
292
def result_plot(data):
    """Render a horizontal bar chart of features whose rate magnitude exceeds 0.1.

    Rows are ordered by ``Rates`` from highest to lowest, rows with
    ``|Rates| <= 0.1`` are dropped, and the remainder is drawn with ``Rates``
    on the x-axis and ``Columns`` on the y-axis.

    Args:
        data: DataFrame containing ``Rates`` and ``Columns``.
    """
    descending_index = data['Rates'].sort_values(ascending=False).index
    ordered = data.reindex(descending_index)

    # Keep only rates whose absolute value is above the 0.1 cut-off.
    strong = ordered[ordered['Rates'].abs() > 0.1]

    fig = px.bar(
        strong,
        x='Rates',
        y='Columns',
        orientation='h',
        labels={'Rates': 'Negative and Positive Features', 'Columns': 'Features'},
    )
    fig.update_layout(
        xaxis_title='Negative and Positive Features',
        yaxis_title='Features',
        title_font_size=25,
        xaxis_title_font_size=25,
        yaxis_title_font_size=25,
        height=800,
    )

    st.plotly_chart(fig, use_container_width=True)
0 commit comments