Skip to content

Commit 15778b3

Browse files
authored
Add files via upload
commit -m"created a webpage for analysis > co-authored-by: NIKITA320495 [email protected] > co-authored-by: Leena2403 [email protected]"
1 parent bd80f4a commit 15778b3

File tree

1 file changed

+316
-0
lines changed

1 file changed

+316
-0
lines changed

functions.py

Lines changed: 316 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,316 @@
1+
import streamlit as st
2+
import pandas as pd
3+
import plotly.express as px
4+
import plotly.graph_objects as go
5+
import pycountry
6+
import numpy as np
7+
from scipy.stats import norm
8+
import random
9+
from scipy.stats import norm
10+
import re
11+
12+
# Per-year survey frames. All paths are relative, so they resolve against the
# working directory the app is started from.
data = pd.read_csv('df2020.csv')
df2018 = pd.read_csv('df2018.csv')
df2019 = pd.read_csv('df2019.csv')

# Full public survey result files, one per year.
full_data2018 = pd.read_csv('../Data/survey_results_public_2018.csv')
full_data2019 = pd.read_csv('../Data/survey_results_public_2019.csv')
full_df2020 = pd.read_csv('../Data/survey_results_public_2020.csv')

# 2020 rows whose SalaryUSD is below 200000. The explicit copy makes df2020 an
# independent frame rather than a slice of `data`, so adding columns to it
# later does not raise pandas' SettingWithCopyWarning.
df2020 = data[data['SalaryUSD'] < 200000].copy()

# features for job satisfaction
results = pd.read_csv("results.csv")
22+
23+
24+
#######################################
25+
# VISUALISATION STARTS
26+
#######################################
27+
28+
######-Nikita-########
29+
30+
def plot_boxplot(data, x, y, title):
    """Render one box trace per group of ``x`` showing the spread of ``y``.

    Args:
        data: DataFrame containing both columns.
        x: Column to group by; every group becomes its own box.
        y: Column whose values are drawn inside each box.
        title: Title placed on the figure.

    The figure is displayed through ``st.plotly_chart``; nothing is returned.
    """
    boxes = [
        go.Box(y=subset[y], name=label)
        for label, subset in data.groupby(x)
    ]
    fig = go.Figure(data=boxes)
    fig.update_layout(title=title, xaxis_title=x, yaxis_title=y)
    st.plotly_chart(fig)
36+
37+
#########################################################################
38+
39+
def plot_bar_plotly(df, column_name, top_n=10, height=450, width=700):
    """Show a bar chart of the ``top_n`` most frequent values of a column.

    Args:
        df: Source DataFrame.
        column_name: Column whose values are counted.
        top_n: Number of most frequent values to keep.
        height: Figure height in pixels.
        width: Figure width in pixels.

    Returns:
        Whatever ``st.plotly_chart`` returns for the rendered figure.
    """
    y_label = 'Number of Developers'

    top_values = df[column_name].value_counts().head(top_n)
    df_counts = top_values.reset_index()
    df_counts.columns = [column_name, 'Count']

    fig = px.bar(
        df_counts,
        x=column_name,
        y='Count',
        labels={column_name: column_name, 'Count': y_label},
        color=column_name,
        color_discrete_sequence=px.colors.qualitative.Pastel,
    )
    fig.update_layout(
        xaxis_title=column_name,
        yaxis_title=y_label,
        height=height,
        width=width,
    )

    return st.plotly_chart(fig)
51+
52+
53+
def plot_pie_plotly(df, column_name, top_n=10, height=400, width=400):
    """Show a pie chart of the ``top_n`` most frequent values of a column.

    Args:
        df: Source DataFrame.
        column_name: Column whose values are counted.
        top_n: Number of most frequent values to include as slices.
        height: Figure height in pixels.
        width: Figure width in pixels.

    The figure is displayed through ``st.plotly_chart``; nothing is returned.
    """
    # Count once and reuse the result for both labels and slice sizes.
    top_values = df[column_name].value_counts().iloc[:top_n]

    fig_pie = go.Figure(
        data=[go.Pie(labels=top_values.index.tolist(), values=top_values.tolist())]
    )
    fig_pie.update_layout(
        title='Top {} Distribution'.format(column_name),
        height=height,
        width=width,
    )

    st.plotly_chart(fig_pie)
62+
63+
def plot_value_counts_plotly(df, column_name):
    """Build a bar chart of how often each value of ``column_name`` occurs.

    NOTE(review): a later function in this file reuses the name
    ``plot_value_counts_plotly`` with the parameters
    ``(column_name, df, column)``. Because it is defined afterwards, the name
    resolves to that later function once the module has been imported.

    Args:
        df: Source DataFrame.
        column_name: Column whose values are counted.

    Returns:
        The Plotly figure; it is not rendered here.
    """
    palette = ['lightseagreen', 'lightgreen', 'lightyellow', 'lightcoral', 'lightsalmon', 'lavender']

    tally = df[column_name].value_counts()
    # One randomly chosen colour is applied to every bar.
    bars = go.Bar(x=tally.index, y=tally.values, marker_color=random.choice(palette))
    fig = go.Figure(bars)
    fig.update_layout(
        title=f'Value Counts for {column_name}',
        xaxis_title='Response',
        yaxis_title='Count',
    )
    return fig
70+
71+
72+
def generate_normal_distribution_plots(df, column, top_n=10):
    """Yield one fitted normal-curve figure of ``SalaryUSD`` per top group.

    The ``top_n`` most frequent values of ``column`` are selected. For each of
    them a normal density using the group's sample mean and standard deviation
    is drawn between the group's minimum and maximum salary, with a dashed red
    line marking the mean.

    Args:
        df: DataFrame containing ``column`` and a ``SalaryUSD`` column.
        column: Grouping column. Titles call each group a country.
        top_n: Number of most frequent groups to plot.

    Yields:
        One Plotly figure per group. Rendering is left to the caller.
    """
    # value_counts() already returns counts in descending order.
    top_groups = df[column].value_counts().iloc[:top_n].index.tolist()

    for country in top_groups:
        salaries = df.loc[df[column] == country, 'SalaryUSD']

        # Compute the distribution parameters once per group.
        mean = salaries.mean()
        std = salaries.std()

        # normal distribution curve
        x_values = np.linspace(salaries.min(), salaries.max(), 100)
        y_values = norm.pdf(x_values, mean, std)

        fig = go.Figure(data=go.Scatter(x=x_values, y=y_values))

        # mean line, from the axis up to the peak of the curve
        fig.add_shape(
            type="line",
            x0=mean, y0=0,
            x1=mean, y1=norm.pdf(mean, mean, std),
            line=dict(color="red", width=2, dash="dash"),
        )

        fig.update_layout(
            title='Normal Distribution of Annual Salaries in {}'.format(country),
            xaxis_title="Annual Salary in USD",
            yaxis_title="Density",
            height=400,
            width=370,
        )

        yield fig
97+
98+
99+
def plot_age_distribution(df, column_name):
    """Show a horizontal bar chart of respondent counts per age range.

    Ages are bucketed into fixed ranges. Any value matching no range (below
    15, missing, or falling between two ranges such as 19.5) is counted as
    ``'Age_unknown'``. Bars are ordered by count, largest first.

    The labels are computed locally, so ``df`` is not modified.

    Args:
        df: Source DataFrame.
        column_name: Numeric column holding the ages.

    The figure is displayed through ``st.plotly_chart``; nothing is returned.
    """
    ages = df[column_name]

    # (condition, label) pairs; the ranges do not overlap.
    age_bins = [
        ((ages >= 15) & (ages <= 19), '15 - 19 years'),
        ((ages >= 20) & (ages <= 24), '20 - 24 years'),
        ((ages >= 25) & (ages <= 29), '25 - 29 years'),
        ((ages >= 30) & (ages <= 34), '30 - 34 years'),
        ((ages >= 35) & (ages <= 39), '35 - 39 years'),
        ((ages >= 40) & (ages <= 45), '40 - 45 years'),
        (ages >= 46, '46 and above years'),
    ]
    age_range = np.select(
        [mask for mask, _ in age_bins],
        [label for _, label in age_bins],
        default='Age_unknown',
    )

    df_age = (
        pd.DataFrame({'Age_range': age_range})
        .groupby(['Age_range'])
        .size()
        .reset_index(name='Count')
        .sort_values(by=['Count'], ascending=False)
    )

    # Plotly bar chart
    fig = go.Figure(data=go.Bar(
        x=df_age['Count'],
        y=df_age['Age_range'],
        orientation='h',
    ))

    # Reversed y axis puts the first (largest) row at the top.
    fig.update_layout(
        xaxis_title='Count',
        yaxis_title='Age Range',
        yaxis=dict(autorange="reversed"),
    )

    st.plotly_chart(fig)
126+
127+
def counts(df, column_name, year):
    """Count individual entries of a ``';'``-separated multi-value column.

    Each cell of ``df[column_name]`` is split on ``';'`` and every resulting
    entry is tallied. Missing cells contribute nothing.

    Args:
        df: Source DataFrame.
        column_name: Column holding the ``';'``-separated strings.
        year: Name given to the count column in the result.

    Returns:
        A DataFrame with a fresh 0..n-1 index and two columns,
        ``column_name`` (the entry) and ``year`` (its count), ordered by
        descending count.
    """
    # explode() gives one row per entry without building the wide padded
    # frame that split(expand=True).stack() would create.
    entries = df[column_name].str.split(';').explode()
    tally = entries.value_counts()
    return pd.DataFrame({column_name: tally.index, year: tally.to_numpy()})
133+
134+
def compare_column_and_plot(column):
    """Compare the share of each entry of ``column`` across 2018, 2019 and 2020.

    Entries are tallied per year with ``counts`` from the module-level frames
    ``df2018``, ``df2019`` and ``df2020``. The three tallies are outer-merged
    and each year is normalised by its own total. The first five rows of the
    normalised table are written to the page, followed by a grouped bar chart.

    Args:
        column: Name of the ``';'``-separated column to compare.

    Output goes through ``st.write`` and ``st.plotly_chart``; nothing is
    returned.
    """
    years = ['2018', '2019', '2020']
    yearly_counts = [
        counts(df2018, column, '2018'),
        counts(df2019, column, '2019'),
        counts(df2020, column, '2020'),
    ]

    # Merge the counts of all three years; the outer join keeps entries that
    # are missing in some years.
    merged = yearly_counts[0]
    for frame in yearly_counts[1:]:
        merged = pd.merge(merged, frame, on=column, how='outer')

    # Fill NaN values with 0 and convert counts to integers
    merged = merged.fillna(0)
    for year in years:
        merged[year] = merged[year].astype(int)

    merged = merged.set_index(column)

    # Fraction of each year's total (values lie in 0..1).
    shares = merged.div(merged.sum())

    st.write(shares.head(5))
    fig = go.Figure()

    # The loop variable must not be called `column`: reusing the parameter
    # name would overwrite it and mislabel the x axis below.
    for year in shares.columns:
        fig.add_trace(go.Bar(x=shares.index, y=shares[year], name=year))

    fig.update_layout(
        xaxis_title=column,
        yaxis_title='Percentages',
        font=dict(size=14),
        barmode='group',
        height=600,
        width=800,
    )

    st.plotly_chart(fig)
172+
173+
def generate_choropleth(df, column_name):
    """Show a world choropleth built from per-country row counts of ``df``.

    Rows are counted per ``Country`` into a ``Respondents`` column and each
    country name is translated to an ISO alpha-3 code with ``pycountry``.
    Names that ``pycountry`` cannot resolve get ``None`` as their code.

    NOTE(review): ``column_name`` is passed as the ``color`` column of the
    grouped frame, whose columns are ``Country``, ``Respondents`` and
    ``Country_code``. Presumably callers pass ``'Respondents'``; confirm.

    Args:
        df: DataFrame with a ``Country`` column.
        column_name: Column of the grouped frame used to colour the map.

    Returns:
        Whatever ``st.plotly_chart`` returns for the rendered figure.
    """
    def to_alpha3(country_name):
        # ISO country code from the country name
        try:
            match = pycountry.countries.lookup(country_name)
            return match.alpha_3
        except LookupError:
            return None

    per_country = df.groupby('Country').size().reset_index(name='Respondents')

    # Adding country code column
    per_country['Country_code'] = per_country['Country'].apply(to_alpha3)

    # choropleth map
    fig = px.choropleth(
        per_country,
        locations="Country_code",
        color=column_name,
        hover_name="Country",
        projection="natural earth",
        color_continuous_scale='Peach',
        range_color=[0, 10000],
        labels={column_name: 'Respondents'},
    )
    fig.update_layout(height=600, width=900)
    return st.plotly_chart(fig)
198+
199+
def gender_vs_top5countries(df, top_n=5, year=2019):
    """Build a grouped bar chart of men/women shares in the largest countries.

    Rows are counted per ``(Country, Gender)`` pair. A percentage is each
    count divided by the total for its country, taken over every ``Gender``
    value present. The ``top_n`` countries with the largest totals are kept,
    and only the ``'Man'`` and ``'Woman'`` rows are plotted.

    Args:
        df: DataFrame with ``Country`` and ``Gender`` columns.
        top_n: Number of countries to show. The default of 5 reproduces the
            original chart.
        year: Year shown in the title. The default of 2019 reproduces the
            original title.

    Returns:
        The Plotly figure; it is not rendered here.
    """
    all_data = df.groupby(['Country', 'Gender']).size().reset_index(name='Count')
    all_data['Total'] = all_data.groupby('Country')['Count'].transform('sum')
    all_data['Percentage'] = all_data['Count'] / all_data['Total'] * 100

    top_countries = all_data.groupby('Country')['Total'].max().nlargest(top_n).index
    top_data = all_data[all_data['Country'].isin(top_countries)]

    fig = go.Figure()

    # (Gender value in the data, legend label, bar colour)
    series = (
        ('Man', 'Men', 'darkblue'),
        ('Woman', 'Women', '#5E96E9'),
    )
    for gender, label, colour in series:
        subset = top_data[top_data['Gender'] == gender]
        fig.add_trace(go.Bar(
            x=subset['Country'],
            y=subset['Percentage'],
            name=label,
            marker_color=colour,
        ))

    fig.update_layout(
        title=f'Gender vs Top {top_n} Countries in {year}',
        xaxis_title=f'Top {top_n} Countries',
        yaxis_title='Percentage',
        barmode='group',
    )

    return fig
228+
def _plot_top_data_scientist_salaries(df, title):
    """Render the ten countries with the highest mean data-scientist salary.

    Rows whose ``DevType`` contains ``'Data scientist'`` are kept (missing
    ``DevType`` counts as no match) and ``SalaryUSD`` is averaged per
    ``Country``. Countries whose mean exceeds 280000 are dropped, and the ten
    highest remaining means are drawn as a horizontal bar chart.

    Args:
        df: DataFrame with ``DevType``, ``Country`` and ``SalaryUSD`` columns.
        title: Chart title.
    """
    is_data_scientist = df['DevType'].str.contains('Data scientist', na=False)
    mean_salary = (
        df[is_data_scientist]
        .groupby('Country')['SalaryUSD']
        .mean()
        .reset_index(name='Mean')
        .sort_values(by=['Mean'], ascending=False)
    )
    mean_salary = mean_salary[mean_salary['Mean'] <= 280000]
    top_mean_salary = mean_salary[:10]

    fig = px.bar(
        top_mean_salary, x='Mean', y='Country', orientation='h',
        labels={'Mean': 'Average Salary in US$', 'Country': 'Country'},
        title=title,
    )
    fig.update_layout(
        yaxis={'categoryorder': 'total ascending'},
        title={'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
    )
    st.plotly_chart(fig)


def heighest_paying_2019():
    """Show the top-10 data-scientist salary chart for module-level ``df2019``."""
    _plot_top_data_scientist_salaries(
        df2019,
        'The Top 10 highest paying data scientist countries in 2019',
    )


def heighest_paying(df):
    """Show the top-10 data-scientist salary chart for an arbitrary frame.

    Args:
        df: DataFrame with ``DevType``, ``Country`` and ``SalaryUSD`` columns.
    """
    _plot_top_data_scientist_salaries(
        df,
        'The Top 10 highest paying data scientist countries ',
    )
256+
def plot_value_counts_plotly(column_name, df, column=None):
    """Plot how often each value of ``column_name`` occurs in ``df``.

    This definition replaces an earlier function of the same name in this
    file whose parameters were ``(df, column_name)`` and which returned the
    figure. Both call forms are accepted here so callers of either keep
    working:

    * ``plot_value_counts_plotly(column_name, df, column)`` renders the chart
      into ``column`` and returns ``None``.
    * ``plot_value_counts_plotly(df, column_name)`` returns the figure without
      rendering it. The same happens whenever ``column`` is omitted.

    Args:
        column_name: Column whose values are counted (or the DataFrame, in
            the legacy argument order).
        df: Source DataFrame (or the column name, in the legacy order).
        column: Object exposing ``plotly_chart`` (``ai_graphs`` passes the
            results of ``st.columns``) to draw into, or ``None`` to get the
            figure back.

    Returns:
        The Plotly figure when ``column`` is ``None``, otherwise ``None``.
    """
    # Legacy argument order: the DataFrame was passed first.
    if isinstance(column_name, pd.DataFrame):
        column_name, df = df, column_name

    values = df[column_name].value_counts()
    bar_colour = random.choice(
        ['lightseagreen', 'lightgreen', 'lightyellow', 'lightcoral', 'lightsalmon', 'lavender']
    )
    fig = go.Figure(data=[go.Bar(x=values.index, y=values.values, marker_color=bar_colour)])
    fig.update_layout(
        title=f'Value Counts for {column_name}',
        xaxis_title='Response',
        yaxis_title='Count',
    )

    if column is None:
        return fig
    column.plotly_chart(fig)
    return None
261+
262+
def ai_graphs():
    """Render value-count charts for the four AI questions of ``full_data2018``.

    The answers are whitespace-stripped and the long answer texts are replaced
    with short labels. One bar chart per question is then drawn in a
    two-column layout: ``AIDangerous`` and ``AIInteresting`` on the left,
    ``AIResponsible`` and ``AIFuture`` on the right.
    """
    st.title('AI Survey Responses')

    def strip_text(value):
        # Only strings are stripped; NaN and other values pass through as-is.
        return value.strip() if isinstance(value, str) else value

    ai_columns = ['AIDangerous', 'AIInteresting', 'AIResponsible', 'AIFuture']
    # Per-column Series.map is the element-wise equivalent of the deprecated
    # DataFrame.applymap and exists in every pandas version.
    df = full_data2018[ai_columns].apply(lambda col: col.map(strip_text))

    short_mapping = {
        'Algorithms making important decisions': 'Algorithms',
        'Artificial intelligence surpassing human intelligence ("the singularity")': 'AI Singularity',
        'Evolving definitions of "fairness" in algorithmic versus human decisions': 'Fairness Evolution',
        "Increasing automation of jobs": 'Automation',
        "The developers or the people creating the AI": 'Developers',
        "A governmental or other regulatory body": 'Government/Regulatory',
        "Prominent industry leaders": 'Industry Leaders',
        "Nobody": 'No Responsibility',
        "I'm excited about the possibilities more than worried about the dangers.": 'Excited about AI Future',
        "I'm worried about the dangers more than I'm excited about the possibilities.": 'Worried about AI Future',
        "I don't care about it, or I haven't thought about it.": 'Indifferent about AI Future'
    }

    df = df.replace(short_mapping)

    col1, col2 = st.columns(2)

    plot_value_counts_plotly('AIDangerous', df, col1)
    plot_value_counts_plotly('AIInteresting', df, col1)
    plot_value_counts_plotly('AIResponsible', df, col2)
    plot_value_counts_plotly('AIFuture', df, col2)
290+
291+
292+
def result_plot(data):
    """Show a horizontal bar chart of the rows with ``|Rates| > 0.1``.

    Rows are ordered by ``Rates`` descending before filtering, then drawn
    with ``Rates`` on the x axis and ``Columns`` on the y axis.

    Args:
        data: DataFrame with a numeric ``Rates`` column and a ``Columns``
            column holding the labels.

    The figure is displayed through ``st.plotly_chart`` at container width;
    nothing is returned.
    """
    axis_x = 'Negative and Positive Features'
    axis_y = 'Features'

    descending_index = data['Rates'].sort_values(ascending=False).index
    ordered = data.reindex(descending_index)

    # Keep only rows whose rate is larger than 0.1 in absolute value.
    strong = ordered[np.abs(ordered['Rates']) > 0.1]

    # Plotly figure
    fig = px.bar(
        strong,
        x='Rates',
        y='Columns',
        orientation='h',
        labels={'Rates': axis_x, 'Columns': axis_y},
    )

    fig.update_layout(
        xaxis_title=axis_x,
        yaxis_title=axis_y,
        title_font_size=25,
        xaxis_title_font_size=25,
        yaxis_title_font_size=25,
        height=800,
    )

    st.plotly_chart(fig, use_container_width=True)

0 commit comments

Comments
 (0)