Skip to content

Commit 904b076

Browse files
committed
Changed dataframe variable to df in all notebooks
1 parent abca08b commit 904b076

5 files changed

+224
-264
lines changed

01-cancer-data-analysis/completed-notebooks/fm-ad-notebook-exploration-COMPLETED.ipynb

+36-61
Original file line numberDiff line numberDiff line change
@@ -389,10 +389,10 @@
389389
" dataframes.append(df)\n",
390390
"\n",
391391
"# Concatenate all the dataframes in the list into a single dataframe\n",
392-
"combined_df = pd.concat(dataframes, ignore_index=True)\n",
392+
"df = pd.concat(dataframes, ignore_index=True)\n",
393393
"\n",
394394
"# Save dataframe to a CSV file\n",
395-
"combined_df.to_csv('combined_data.csv', index=False)"
395+
"df.to_csv('combined_data.csv', index=False)"
396396
]
397397
},
398398
{
@@ -442,7 +442,7 @@
442442
},
443443
{
444444
"cell_type": "code",
445-
"execution_count": 9,
445+
"execution_count": 7,
446446
"metadata": {},
447447
"outputs": [
448448
{
@@ -830,14 +830,14 @@
830830
"4 NaN NaN "
831831
]
832832
},
833-
"execution_count": 9,
833+
"execution_count": 7,
834834
"metadata": {},
835835
"output_type": "execute_result"
836836
}
837837
],
838838
"source": [
839839
"# show first few records of the dataframe\n",
840-
"combined_df.head()\n"
840+
"df.head()"
841841
]
842842
},
843843
{
@@ -856,7 +856,7 @@
856856
},
857857
{
858858
"cell_type": "code",
859-
"execution_count": 10,
859+
"execution_count": 8,
860860
"metadata": {},
861861
"outputs": [
862862
{
@@ -913,7 +913,7 @@
913913
],
914914
"source": [
915915
"# get an overview of the dataframe\n",
916-
"combined_df.info()"
916+
"df.info()"
917917
]
918918
},
919919
{
@@ -925,7 +925,7 @@
925925
},
926926
{
927927
"cell_type": "code",
928-
"execution_count": 13,
928+
"execution_count": 9,
929929
"metadata": {},
930930
"outputs": [
931931
{
@@ -934,17 +934,14 @@
934934
"(72016, 38)"
935935
]
936936
},
937-
"execution_count": 13,
937+
"execution_count": 9,
938938
"metadata": {},
939939
"output_type": "execute_result"
940940
}
941941
],
942942
"source": [
943943
"# show the dataframe's dimensions\n",
944-
"combined_df.shape\n",
945-
"\n",
946-
"\n",
947-
"\n"
944+
"df.shape"
948945
]
949946
},
950947
{
@@ -956,7 +953,7 @@
956953
},
957954
{
958955
"cell_type": "code",
959-
"execution_count": 12,
956+
"execution_count": 10,
960957
"metadata": {},
961958
"outputs": [
962959
{
@@ -1003,14 +1000,14 @@
10031000
"dtype: object"
10041001
]
10051002
},
1006-
"execution_count": 12,
1003+
"execution_count": 10,
10071004
"metadata": {},
10081005
"output_type": "execute_result"
10091006
}
10101007
],
10111008
"source": [
10121009
"# show the columns and their data types\n",
1013-
"combined_df.dtypes"
1010+
"df.dtypes"
10141011
]
10151012
},
10161013
{
@@ -1024,7 +1021,7 @@
10241021
},
10251022
{
10261023
"cell_type": "code",
1027-
"execution_count": 14,
1024+
"execution_count": 11,
10281025
"metadata": {},
10291026
"outputs": [
10301027
{
@@ -1128,14 +1125,14 @@
11281125
"max 49.0 "
11291126
]
11301127
},
1131-
"execution_count": 14,
1128+
"execution_count": 11,
11321129
"metadata": {},
11331130
"output_type": "execute_result"
11341131
}
11351132
],
11361133
"source": [
11371134
"# show descriptive statistics of the dataframe\n",
1138-
"combined_df.describe()"
1135+
"df.describe()"
11391136
]
11401137
},
11411138
{
@@ -1154,7 +1151,7 @@
11541151
},
11551152
{
11561153
"cell_type": "code",
1157-
"execution_count": 16,
1154+
"execution_count": 12,
11581155
"metadata": {},
11591156
"outputs": [
11601157
{
@@ -1201,36 +1198,14 @@
12011198
"dtype: int64"
12021199
]
12031200
},
1204-
"execution_count": 16,
1201+
"execution_count": 12,
12051202
"metadata": {},
12061203
"output_type": "execute_result"
12071204
}
12081205
],
12091206
"source": [
12101207
"# show the number of missing values in each column in descending order\n",
1211-
"combined_df.isnull().sum().sort_values(ascending=False)\n"
1212-
]
1213-
},
1214-
{
1215-
"cell_type": "code",
1216-
"execution_count": 17,
1217-
"metadata": {},
1218-
"outputs": [
1219-
{
1220-
"data": {
1221-
"text/plain": [
1222-
"['cases.submitter_id', 'case_id', 'cases.disease_type', 'cases.primary_site']"
1223-
]
1224-
},
1225-
"execution_count": 17,
1226-
"metadata": {},
1227-
"output_type": "execute_result"
1228-
}
1229-
],
1230-
"source": [
1231-
"# show columns that start with 'case'\n",
1232-
"case_columns = [col for col in combined_df.columns if col.startswith('case')]\n",
1233-
"case_columns"
1208+
"df.isnull().sum().sort_values(ascending=False)"
12341209
]
12351210
},
12361211
{
@@ -1249,7 +1224,7 @@
12491224
},
12501225
{
12511226
"cell_type": "code",
1252-
"execution_count": 19,
1227+
"execution_count": 13,
12531228
"metadata": {
12541229
"slideshow": {
12551230
"slide_type": "slide"
@@ -1266,14 +1241,14 @@
12661241
"dtype: int64"
12671242
]
12681243
},
1269-
"execution_count": 19,
1244+
"execution_count": 13,
12701245
"metadata": {},
12711246
"output_type": "execute_result"
12721247
}
12731248
],
12741249
"source": [
12751250
"# show which columns have the value 'Unknown' in them and show how many each column has in descending order\n",
1276-
"unknown_values = combined_df.isin(['Unknown']).sum().sort_values(ascending=False)\n",
1251+
"unknown_values = df.isin(['Unknown']).sum().sort_values(ascending=False)\n",
12771252
"unknown_values[unknown_values > 0]"
12781253
]
12791254
},
@@ -1293,7 +1268,7 @@
12931268
},
12941269
{
12951270
"cell_type": "code",
1296-
"execution_count": 21,
1271+
"execution_count": 14,
12971272
"metadata": {},
12981273
"outputs": [
12991274
{
@@ -1340,14 +1315,14 @@
13401315
"dtype: int64"
13411316
]
13421317
},
1343-
"execution_count": 21,
1318+
"execution_count": 14,
13441319
"metadata": {},
13451320
"output_type": "execute_result"
13461321
}
13471322
],
13481323
"source": [
13491324
"# show the number of unique values in each column in descending order\n",
1350-
"unique_values = combined_df.nunique().sort_values(ascending=False)\n",
1325+
"unique_values = df.nunique().sort_values(ascending=False)\n",
13511326
"unique_values"
13521327
]
13531328
},
@@ -1360,7 +1335,7 @@
13601335
},
13611336
{
13621337
"cell_type": "code",
1363-
"execution_count": 22,
1338+
"execution_count": 15,
13641339
"metadata": {},
13651340
"outputs": [
13661341
{
@@ -1400,7 +1375,7 @@
14001375
"# show 5 unique values of columns with unique values less than 100\n",
14011376
"for col, n_unique in unique_values.items():\n",
14021377
" if n_unique < 100:\n",
1403-
" unique_vals = combined_df[col].unique()\n",
1378+
" unique_vals = df[col].unique()\n",
14041379
" print(f\"{col}: {unique_vals[:5]}\")"
14051380
]
14061381
},
@@ -1420,7 +1395,7 @@
14201395
},
14211396
{
14221397
"cell_type": "code",
1423-
"execution_count": 24,
1398+
"execution_count": 16,
14241399
"metadata": {},
14251400
"outputs": [
14261401
{
@@ -1429,14 +1404,14 @@
14291404
"36008"
14301405
]
14311406
},
1432-
"execution_count": 24,
1407+
"execution_count": 16,
14331408
"metadata": {},
14341409
"output_type": "execute_result"
14351410
}
14361411
],
14371412
"source": [
14381413
"# show the number of duplicate records in the dataframe\n",
1439-
"n_duplicates = combined_df.duplicated().sum()\n",
1414+
"n_duplicates = df.duplicated().sum()\n",
14401415
"n_duplicates"
14411416
]
14421417
},
@@ -1451,7 +1426,7 @@
14511426
},
14521427
{
14531428
"cell_type": "code",
1454-
"execution_count": 28,
1429+
"execution_count": 17,
14551430
"metadata": {},
14561431
"outputs": [
14571432
{
@@ -19465,14 +19440,14 @@
1946519440
"Name: count, dtype: int64"
1946619441
]
1946719442
},
19468-
"execution_count": 28,
19443+
"execution_count": 17,
1946919444
"metadata": {},
1947019445
"output_type": "execute_result"
1947119446
}
1947219447
],
1947319448
"source": [
1947419449
"# count how many records share the same case_id\n",
19475-
"case_id_counts = combined_df['case_id'].value_counts()\n",
19450+
"case_id_counts = df['case_id'].value_counts()\n",
1947619451
"case_id_counts"
1947719452
]
1947819453
},
@@ -19485,7 +19460,7 @@
1948519460
},
1948619461
{
1948719462
"cell_type": "code",
19488-
"execution_count": 29,
19463+
"execution_count": 18,
1948919464
"metadata": {},
1949019465
"outputs": [
1949119466
{
@@ -19834,7 +19809,7 @@
1983419809
"36299 NaN NaN "
1983519810
]
1983619811
},
19837-
"execution_count": 29,
19812+
"execution_count": 18,
1983819813
"metadata": {},
1983919814
"output_type": "execute_result"
1984019815
}
@@ -19843,7 +19818,7 @@
1984319818
"# show the records with the case_id 40e57344-a8ad-4de4-92e4-6e681c0593b7\n",
1984419819
"case_id = '40e57344-a8ad-4de4-92e4-6e681c0593b7'\n",
1984519820
"\n",
19846-
"combined_df[combined_df['case_id'] == '40e57344-a8ad-4de4-92e4-6e681c0593b7']"
19821+
"df[df['case_id'] == '40e57344-a8ad-4de4-92e4-6e681c0593b7']"
1984719822
]
1984819823
},
1984919824
{

0 commit comments

Comments
 (0)