Internet2
diff --git a/‎01-cancer-data-analysis/completed-notebooks/fm-ad-notebook-exploration-COMPLETED.ipynb
Lines changed: 36 additions & 61 deletions b/‎01-cancer-data-analysis/completed-notebooks/fm-ad-notebook-exploration-COMPLETED.ipynb
Lines changed: 36 additions & 61 deletions
@@ -389,10 +389,10 @@
     "        dataframes.append(df)\n",
     "\n",
     "# Concatenate all the dataframes in the list into a single dataframe\n",
-    "combined_df = pd.concat(dataframes, ignore_index=True)\n",
+    "df = pd.concat(dataframes, ignore_index=True)\n",
     "\n",
     "# Save dataframe to a CSV file\n",
-    "combined_df.to_csv('combined_data.csv', index=False)"
+    "df.to_csv('combined_data.csv', index=False)"
    ]
   },
   {
@@ -442,7 +442,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -830,14 +830,14 @@
        "4                      NaN                         NaN  "
       ]
      },
-     "execution_count": 9,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "# show first few records of the dataframe\n",
-    "combined_df.head()\n"
+    "df.head()"
    ]
   },
   {
@@ -856,7 +856,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -913,7 +913,7 @@
    ],
    "source": [
     "# get an overview of the dataframe\n",
-    "combined_df.info()"
+    "df.info()"
    ]
   },
   {
@@ -925,7 +925,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -934,17 +934,14 @@
        "(72016, 38)"
       ]
      },
-     "execution_count": 13,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "# show the dataframe's dimensions\n",
-    "combined_df.shape\n",
-    "\n",
-    "\n",
-    "\n"
+    "df.shape"
    ]
   },
   {
@@ -956,7 +953,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -1003,14 +1000,14 @@
        "dtype: object"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "# show the columns and their data types\n",
-    "combined_df.dtypes"
+    "df.dtypes"
    ]
   },
   {
@@ -1024,7 +1021,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -1128,14 +1125,14 @@
        "max                       49.0  "
       ]
      },
-     "execution_count": 14,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "# show descriptive statistics of the dataframe\n",
-    "combined_df.describe()"
+    "df.describe()"
    ]
   },
   {
@@ -1154,7 +1151,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
@@ -1201,36 +1198,14 @@
        "dtype: int64"
       ]
      },
-     "execution_count": 16,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "# show the number of missing values in each column in descending order\n",
-    "combined_df.isnull().sum().sort_values(ascending=False)\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['cases.submitter_id', 'case_id', 'cases.disease_type', 'cases.primary_site']"
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# show columns that start with 'case'\n",
-    "case_columns = [col for col in combined_df.columns if col.startswith('case')]\n",
-    "case_columns"
+    "df.isnull().sum().sort_values(ascending=False)"
    ]
   },
   {
@@ -1249,7 +1224,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 13,
    "metadata": {
     "slideshow": {
      "slide_type": "slide"
@@ -1266,14 +1241,14 @@
        "dtype: int64"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "# show which columns have the value 'Unknown' in them and show how many each column has in descending order\n",
-    "unknown_values = combined_df.isin(['Unknown']).sum().sort_values(ascending=False)\n",
+    "unknown_values = df.isin(['Unknown']).sum().sort_values(ascending=False)\n",
     "unknown_values[unknown_values > 0]"
    ]
   },
@@ -1293,7 +1268,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
@@ -1340,14 +1315,14 @@
        "dtype: int64"
       ]
      },
-     "execution_count": 21,
+     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "# show the number of unique values in each column in descending order\n",
-    "unique_values = combined_df.nunique().sort_values(ascending=False)\n",
+    "unique_values = df.nunique().sort_values(ascending=False)\n",
     "unique_values"
    ]
   },
@@ -1360,7 +1335,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
@@ -1400,7 +1375,7 @@
     "# show 5 unique values of columns with unique values less than 100\n",
     "for col, n_unique in unique_values.items():\n",
     "    if n_unique < 100:\n",
-    "        unique_vals = combined_df[col].unique()\n",
+    "        unique_vals = df[col].unique()\n",
     "        print(f\"{col}: {unique_vals[:5]}\")"
    ]
   },
@@ -1420,7 +1395,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
@@ -1429,14 +1404,14 @@
        "36008"
       ]
      },
-     "execution_count": 24,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "# show the number of duplicate records in the dataframe\n",
-    "n_duplicates = combined_df.duplicated().sum()\n",
+    "n_duplicates = df.duplicated().sum()\n",
     "n_duplicates"
    ]
   },
@@ -1451,7 +1426,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
@@ -19465,14 +19440,14 @@
        "Name: count, dtype: int64"
       ]
      },
-     "execution_count": 28,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "# count how many records share the same case_id\n",
-    "case_id_counts = combined_df['case_id'].value_counts()\n",
+    "case_id_counts = df['case_id'].value_counts()\n",
     "case_id_counts"
    ]
   },
@@ -19485,7 +19460,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
@@ -19834,7 +19809,7 @@
        "36299                      NaN                         NaN  "
       ]
      },
-     "execution_count": 29,
+     "execution_count": 18,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -19843,7 +19818,7 @@
     "# show the records with the case_id 40e57344-a8ad-4de4-92e4-6e681c0593b7\n",
     "case_id = '40e57344-a8ad-4de4-92e4-6e681c0593b7'\n",
     "\n",
-    "combined_df[combined_df['case_id'] == '40e57344-a8ad-4de4-92e4-6e681c0593b7']"
+    "df[df['case_id'] == '40e57344-a8ad-4de4-92e4-6e681c0593b7']"
    ]
   },
   {
Original file line number	Diff line number	Diff line change
`@@ -389,10 +389,10 @@`
`389`	`389`	`" dataframes.append(df)\n",`
`390`	`390`	`"\n",`
`391`	`391`	`"# Concatenate all the dataframes in the list into a single dataframe\n",`
`392`		`- "combined_df = pd.concat(dataframes, ignore_index=True)\n",`
	`392`	`+ "df = pd.concat(dataframes, ignore_index=True)\n",`
`393`	`393`	`"\n",`
`394`	`394`	`"# Save dataframe to a CSV file\n",`
`395`		`- "combined_df.to_csv('combined_data.csv', index=False)"`
	`395`	`+ "df.to_csv('combined_data.csv', index=False)"`
`396`	`396`	`]`
`397`	`397`	`},`
`398`	`398`	`{`
`@@ -442,7 +442,7 @@`
`442`	`442`	`},`
`443`	`443`	`{`
`444`	`444`	`"cell_type": "code",`
`445`		`- "execution_count": 9,`
	`445`	`+ "execution_count": 7,`
`446`	`446`	`"metadata": {},`
`447`	`447`	`"outputs": [`
`448`	`448`	`{`
`@@ -830,14 +830,14 @@`
`830`	`830`	`"4 NaN NaN "`
`831`	`831`	`]`
`832`	`832`	`},`
`833`		`- "execution_count": 9,`
	`833`	`+ "execution_count": 7,`
`834`	`834`	`"metadata": {},`
`835`	`835`	`"output_type": "execute_result"`
`836`	`836`	`}`
`837`	`837`	`],`
`838`	`838`	`"source": [`
`839`	`839`	`"# show first few records of the dataframe\n",`
`840`		`- "combined_df.head()\n"`
	`840`	`+ "df.head()"`
`841`	`841`	`]`
`842`	`842`	`},`
`843`	`843`	`{`
`@@ -856,7 +856,7 @@`
`856`	`856`	`},`
`857`	`857`	`{`
`858`	`858`	`"cell_type": "code",`
`859`		`- "execution_count": 10,`
	`859`	`+ "execution_count": 8,`
`860`	`860`	`"metadata": {},`
`861`	`861`	`"outputs": [`
`862`	`862`	`{`
`@@ -913,7 +913,7 @@`
`913`	`913`	`],`
`914`	`914`	`"source": [`
`915`	`915`	`"# get an overview of the dataframe\n",`
`916`		`- "combined_df.info()"`
	`916`	`+ "df.info()"`
`917`	`917`	`]`
`918`	`918`	`},`
`919`	`919`	`{`
`@@ -925,7 +925,7 @@`
`925`	`925`	`},`
`926`	`926`	`{`
`927`	`927`	`"cell_type": "code",`
`928`		`- "execution_count": 13,`
	`928`	`+ "execution_count": 9,`
`929`	`929`	`"metadata": {},`
`930`	`930`	`"outputs": [`
`931`	`931`	`{`
`@@ -934,17 +934,14 @@`
`934`	`934`	`"(72016, 38)"`
`935`	`935`	`]`
`936`	`936`	`},`
`937`		`- "execution_count": 13,`
	`937`	`+ "execution_count": 9,`
`938`	`938`	`"metadata": {},`
`939`	`939`	`"output_type": "execute_result"`
`940`	`940`	`}`
`941`	`941`	`],`
`942`	`942`	`"source": [`
`943`	`943`	`"# show the dataframe's dimensions\n",`
`944`		`- "combined_df.shape\n",`
`945`		`- "\n",`
`946`		`- "\n",`
`947`		`- "\n"`
	`944`	`+ "df.shape"`
`948`	`945`	`]`
`949`	`946`	`},`
`950`	`947`	`{`
`@@ -956,7 +953,7 @@`
`956`	`953`	`},`
`957`	`954`	`{`
`958`	`955`	`"cell_type": "code",`
`959`		`- "execution_count": 12,`
	`956`	`+ "execution_count": 10,`
`960`	`957`	`"metadata": {},`
`961`	`958`	`"outputs": [`
`962`	`959`	`{`
`@@ -1003,14 +1000,14 @@`
`1003`	`1000`	`"dtype: object"`
`1004`	`1001`	`]`
`1005`	`1002`	`},`
`1006`		`- "execution_count": 12,`
	`1003`	`+ "execution_count": 10,`
`1007`	`1004`	`"metadata": {},`
`1008`	`1005`	`"output_type": "execute_result"`
`1009`	`1006`	`}`
`1010`	`1007`	`],`
`1011`	`1008`	`"source": [`
`1012`	`1009`	`"# show the columns and their data types\n",`
`1013`		`- "combined_df.dtypes"`
	`1010`	`+ "df.dtypes"`
`1014`	`1011`	`]`
`1015`	`1012`	`},`
`1016`	`1013`	`{`
`@@ -1024,7 +1021,7 @@`
`1024`	`1021`	`},`
`1025`	`1022`	`{`
`1026`	`1023`	`"cell_type": "code",`
`1027`		`- "execution_count": 14,`
	`1024`	`+ "execution_count": 11,`
`1028`	`1025`	`"metadata": {},`
`1029`	`1026`	`"outputs": [`
`1030`	`1027`	`{`
`@@ -1128,14 +1125,14 @@`
`1128`	`1125`	`"max 49.0 "`
`1129`	`1126`	`]`
`1130`	`1127`	`},`
`1131`		`- "execution_count": 14,`
	`1128`	`+ "execution_count": 11,`
`1132`	`1129`	`"metadata": {},`
`1133`	`1130`	`"output_type": "execute_result"`
`1134`	`1131`	`}`
`1135`	`1132`	`],`
`1136`	`1133`	`"source": [`
`1137`	`1134`	`"# show descriptive statistics of the dataframe\n",`
`1138`		`- "combined_df.describe()"`
	`1135`	`+ "df.describe()"`
`1139`	`1136`	`]`
`1140`	`1137`	`},`
`1141`	`1138`	`{`
`@@ -1154,7 +1151,7 @@`
`1154`	`1151`	`},`
`1155`	`1152`	`{`
`1156`	`1153`	`"cell_type": "code",`
`1157`		`- "execution_count": 16,`
	`1154`	`+ "execution_count": 12,`
`1158`	`1155`	`"metadata": {},`
`1159`	`1156`	`"outputs": [`
`1160`	`1157`	`{`
`@@ -1201,36 +1198,14 @@`
`1201`	`1198`	`"dtype: int64"`
`1202`	`1199`	`]`
`1203`	`1200`	`},`
`1204`		`- "execution_count": 16,`
	`1201`	`+ "execution_count": 12,`
`1205`	`1202`	`"metadata": {},`
`1206`	`1203`	`"output_type": "execute_result"`
`1207`	`1204`	`}`
`1208`	`1205`	`],`
`1209`	`1206`	`"source": [`
`1210`	`1207`	`"# show the number of missing values in each column in descending order\n",`
`1211`		`- "combined_df.isnull().sum().sort_values(ascending=False)\n"`
`1212`		`- ]`
`1213`		`- },`
`1214`		`- {`
`1215`		`- "cell_type": "code",`
`1216`		`- "execution_count": 17,`
`1217`		`- "metadata": {},`
`1218`		`- "outputs": [`
`1219`		`- {`
`1220`		`- "data": {`
`1221`		`- "text/plain": [`
`1222`		`- "['cases.submitter_id', 'case_id', 'cases.disease_type', 'cases.primary_site']"`
`1223`		`- ]`
`1224`		`- },`
`1225`		`- "execution_count": 17,`
`1226`		`- "metadata": {},`
`1227`		`- "output_type": "execute_result"`
`1228`		`- }`
`1229`		`- ],`
`1230`		`- "source": [`
`1231`		`- "# show columns that start with 'case'\n",`
`1232`		`- "case_columns = [col for col in combined_df.columns if col.startswith('case')]\n",`
`1233`		`- "case_columns"`
	`1208`	`+ "df.isnull().sum().sort_values(ascending=False)"`
`1234`	`1209`	`]`
`1235`	`1210`	`},`
`1236`	`1211`	`{`
`@@ -1249,7 +1224,7 @@`
`1249`	`1224`	`},`
`1250`	`1225`	`{`
`1251`	`1226`	`"cell_type": "code",`
`1252`		`- "execution_count": 19,`
	`1227`	`+ "execution_count": 13,`
`1253`	`1228`	`"metadata": {`
`1254`	`1229`	`"slideshow": {`
`1255`	`1230`	`"slide_type": "slide"`
`@@ -1266,14 +1241,14 @@`
`1266`	`1241`	`"dtype: int64"`
`1267`	`1242`	`]`
`1268`	`1243`	`},`
`1269`		`- "execution_count": 19,`
	`1244`	`+ "execution_count": 13,`
`1270`	`1245`	`"metadata": {},`
`1271`	`1246`	`"output_type": "execute_result"`
`1272`	`1247`	`}`
`1273`	`1248`	`],`
`1274`	`1249`	`"source": [`
`1275`	`1250`	`"# show which columns have the value 'Unknown' in them and show how many each column has in descending order\n",`
`1276`		`- "unknown_values = combined_df.isin(['Unknown']).sum().sort_values(ascending=False)\n",`
	`1251`	`+ "unknown_values = df.isin(['Unknown']).sum().sort_values(ascending=False)\n",`
`1277`	`1252`	`"unknown_values[unknown_values > 0]"`
`1278`	`1253`	`]`
`1279`	`1254`	`},`
`@@ -1293,7 +1268,7 @@`
`1293`	`1268`	`},`
`1294`	`1269`	`{`
`1295`	`1270`	`"cell_type": "code",`
`1296`		`- "execution_count": 21,`
	`1271`	`+ "execution_count": 14,`
`1297`	`1272`	`"metadata": {},`
`1298`	`1273`	`"outputs": [`
`1299`	`1274`	`{`
`@@ -1340,14 +1315,14 @@`
`1340`	`1315`	`"dtype: int64"`
`1341`	`1316`	`]`
`1342`	`1317`	`},`
`1343`		`- "execution_count": 21,`
	`1318`	`+ "execution_count": 14,`
`1344`	`1319`	`"metadata": {},`
`1345`	`1320`	`"output_type": "execute_result"`
`1346`	`1321`	`}`
`1347`	`1322`	`],`
`1348`	`1323`	`"source": [`
`1349`	`1324`	`"# show the number of unique values in each column in descending order\n",`
`1350`		`- "unique_values = combined_df.nunique().sort_values(ascending=False)\n",`
	`1325`	`+ "unique_values = df.nunique().sort_values(ascending=False)\n",`
`1351`	`1326`	`"unique_values"`
`1352`	`1327`	`]`
`1353`	`1328`	`},`
`@@ -1360,7 +1335,7 @@`
`1360`	`1335`	`},`
`1361`	`1336`	`{`
`1362`	`1337`	`"cell_type": "code",`
`1363`		`- "execution_count": 22,`
	`1338`	`+ "execution_count": 15,`
`1364`	`1339`	`"metadata": {},`
`1365`	`1340`	`"outputs": [`
`1366`	`1341`	`{`
`@@ -1400,7 +1375,7 @@`
`1400`	`1375`	`"# show 5 unique values of columns with unique values less than 100\n",`
`1401`	`1376`	`"for col, n_unique in unique_values.items():\n",`
`1402`	`1377`	`" if n_unique < 100:\n",`
`1403`		`- " unique_vals = combined_df[col].unique()\n",`
	`1378`	`+ " unique_vals = df[col].unique()\n",`
`1404`	`1379`	`" print(f\"{col}: {unique_vals[:5]}\")"`
`1405`	`1380`	`]`
`1406`	`1381`	`},`
`@@ -1420,7 +1395,7 @@`
`1420`	`1395`	`},`
`1421`	`1396`	`{`
`1422`	`1397`	`"cell_type": "code",`
`1423`		`- "execution_count": 24,`
	`1398`	`+ "execution_count": 16,`
`1424`	`1399`	`"metadata": {},`
`1425`	`1400`	`"outputs": [`
`1426`	`1401`	`{`
`@@ -1429,14 +1404,14 @@`
`1429`	`1404`	`"36008"`
`1430`	`1405`	`]`
`1431`	`1406`	`},`
`1432`		`- "execution_count": 24,`
	`1407`	`+ "execution_count": 16,`
`1433`	`1408`	`"metadata": {},`
`1434`	`1409`	`"output_type": "execute_result"`
`1435`	`1410`	`}`
`1436`	`1411`	`],`
`1437`	`1412`	`"source": [`
`1438`	`1413`	`"# show the number of duplicate records in the dataframe\n",`
`1439`		`- "n_duplicates = combined_df.duplicated().sum()\n",`
	`1414`	`+ "n_duplicates = df.duplicated().sum()\n",`
`1440`	`1415`	`"n_duplicates"`
`1441`	`1416`	`]`
`1442`	`1417`	`},`
`@@ -1451,7 +1426,7 @@`
`1451`	`1426`	`},`
`1452`	`1427`	`{`
`1453`	`1428`	`"cell_type": "code",`
`1454`		`- "execution_count": 28,`
	`1429`	`+ "execution_count": 17,`
`1455`	`1430`	`"metadata": {},`
`1456`	`1431`	`"outputs": [`
`1457`	`1432`	`{`
`@@ -19465,14 +19440,14 @@`
`19465`	`19440`	`"Name: count, dtype: int64"`
`19466`	`19441`	`]`
`19467`	`19442`	`},`
`19468`		`- "execution_count": 28,`
	`19443`	`+ "execution_count": 17,`
`19469`	`19444`	`"metadata": {},`
`19470`	`19445`	`"output_type": "execute_result"`
`19471`	`19446`	`}`
`19472`	`19447`	`],`
`19473`	`19448`	`"source": [`
`19474`	`19449`	`"# count how many records share the same case_id\n",`
`19475`		`- "case_id_counts = combined_df['case_id'].value_counts()\n",`
	`19450`	`+ "case_id_counts = df['case_id'].value_counts()\n",`
`19476`	`19451`	`"case_id_counts"`
`19477`	`19452`	`]`
`19478`	`19453`	`},`
`@@ -19485,7 +19460,7 @@`
`19485`	`19460`	`},`
`19486`	`19461`	`{`
`19487`	`19462`	`"cell_type": "code",`
`19488`		`- "execution_count": 29,`
	`19463`	`+ "execution_count": 18,`
`19489`	`19464`	`"metadata": {},`
`19490`	`19465`	`"outputs": [`
`19491`	`19466`	`{`
`@@ -19834,7 +19809,7 @@`
`19834`	`19809`	`"36299 NaN NaN "`
`19835`	`19810`	`]`
`19836`	`19811`	`},`
`19837`		`- "execution_count": 29,`
	`19812`	`+ "execution_count": 18,`
`19838`	`19813`	`"metadata": {},`
`19839`	`19814`	`"output_type": "execute_result"`
`19840`	`19815`	`}`
`@@ -19843,7 +19818,7 @@`
`19843`	`19818`	`"# show the records with the case_id 40e57344-a8ad-4de4-92e4-6e681c0593b7\n",`
`19844`	`19819`	`"case_id = '40e57344-a8ad-4de4-92e4-6e681c0593b7'\n",`
`19845`	`19820`	`"\n",`
`19846`		`- "combined_df[combined_df['case_id'] == '40e57344-a8ad-4de4-92e4-6e681c0593b7']"`
	`19821`	`+ "df[df['case_id'] == '40e57344-a8ad-4de4-92e4-6e681c0593b7']"`
`19847`	`19822`	`]`
`19848`	`19823`	`},`
`19849`	`19824`	`{`