|
389 | 389 | " dataframes.append(df)\n",
|
390 | 390 | "\n",
|
391 | 391 | "# Concatenate all the dataframes in the list into a single dataframe\n",
|
392 |
| - "combined_df = pd.concat(dataframes, ignore_index=True)\n", |
| 392 | + "df = pd.concat(dataframes, ignore_index=True)\n", |
393 | 393 | "\n",
|
394 | 394 | "# Save dataframe to a CSV file\n",
|
395 |
| - "combined_df.to_csv('combined_data.csv', index=False)" |
| 395 | + "df.to_csv('combined_data.csv', index=False)" |
396 | 396 | ]
|
397 | 397 | },
|
398 | 398 | {
|
|
442 | 442 | },
|
443 | 443 | {
|
444 | 444 | "cell_type": "code",
|
445 |
| - "execution_count": 9, |
| 445 | + "execution_count": 7, |
446 | 446 | "metadata": {},
|
447 | 447 | "outputs": [
|
448 | 448 | {
|
|
830 | 830 | "4 NaN NaN "
|
831 | 831 | ]
|
832 | 832 | },
|
833 |
| - "execution_count": 9, |
| 833 | + "execution_count": 7, |
834 | 834 | "metadata": {},
|
835 | 835 | "output_type": "execute_result"
|
836 | 836 | }
|
837 | 837 | ],
|
838 | 838 | "source": [
|
839 | 839 | "# show first few records of the dataframe\n",
|
840 |
| - "combined_df.head()\n" |
| 840 | + "df.head()" |
841 | 841 | ]
|
842 | 842 | },
|
843 | 843 | {
|
|
856 | 856 | },
|
857 | 857 | {
|
858 | 858 | "cell_type": "code",
|
859 |
| - "execution_count": 10, |
| 859 | + "execution_count": 8, |
860 | 860 | "metadata": {},
|
861 | 861 | "outputs": [
|
862 | 862 | {
|
|
913 | 913 | ],
|
914 | 914 | "source": [
|
915 | 915 | "# get an overview of the dataframe\n",
|
916 |
| - "combined_df.info()" |
| 916 | + "df.info()" |
917 | 917 | ]
|
918 | 918 | },
|
919 | 919 | {
|
|
925 | 925 | },
|
926 | 926 | {
|
927 | 927 | "cell_type": "code",
|
928 |
| - "execution_count": 13, |
| 928 | + "execution_count": 9, |
929 | 929 | "metadata": {},
|
930 | 930 | "outputs": [
|
931 | 931 | {
|
|
934 | 934 | "(72016, 38)"
|
935 | 935 | ]
|
936 | 936 | },
|
937 |
| - "execution_count": 13, |
| 937 | + "execution_count": 9, |
938 | 938 | "metadata": {},
|
939 | 939 | "output_type": "execute_result"
|
940 | 940 | }
|
941 | 941 | ],
|
942 | 942 | "source": [
|
943 | 943 | "# show the dataframe's dimensions\n",
|
944 |
| - "combined_df.shape\n", |
945 |
| - "\n", |
946 |
| - "\n", |
947 |
| - "\n" |
| 944 | + "df.shape" |
948 | 945 | ]
|
949 | 946 | },
|
950 | 947 | {
|
|
956 | 953 | },
|
957 | 954 | {
|
958 | 955 | "cell_type": "code",
|
959 |
| - "execution_count": 12, |
| 956 | + "execution_count": 10, |
960 | 957 | "metadata": {},
|
961 | 958 | "outputs": [
|
962 | 959 | {
|
|
1003 | 1000 | "dtype: object"
|
1004 | 1001 | ]
|
1005 | 1002 | },
|
1006 |
| - "execution_count": 12, |
| 1003 | + "execution_count": 10, |
1007 | 1004 | "metadata": {},
|
1008 | 1005 | "output_type": "execute_result"
|
1009 | 1006 | }
|
1010 | 1007 | ],
|
1011 | 1008 | "source": [
|
1012 | 1009 | "# show the columns and their data types\n",
|
1013 |
| - "combined_df.dtypes" |
| 1010 | + "df.dtypes" |
1014 | 1011 | ]
|
1015 | 1012 | },
|
1016 | 1013 | {
|
|
1024 | 1021 | },
|
1025 | 1022 | {
|
1026 | 1023 | "cell_type": "code",
|
1027 |
| - "execution_count": 14, |
| 1024 | + "execution_count": 11, |
1028 | 1025 | "metadata": {},
|
1029 | 1026 | "outputs": [
|
1030 | 1027 | {
|
|
1128 | 1125 | "max 49.0 "
|
1129 | 1126 | ]
|
1130 | 1127 | },
|
1131 |
| - "execution_count": 14, |
| 1128 | + "execution_count": 11, |
1132 | 1129 | "metadata": {},
|
1133 | 1130 | "output_type": "execute_result"
|
1134 | 1131 | }
|
1135 | 1132 | ],
|
1136 | 1133 | "source": [
|
1137 | 1134 | "# show descriptive statistics of the dataframe\n",
|
1138 |
| - "combined_df.describe()" |
| 1135 | + "df.describe()" |
1139 | 1136 | ]
|
1140 | 1137 | },
|
1141 | 1138 | {
|
|
1154 | 1151 | },
|
1155 | 1152 | {
|
1156 | 1153 | "cell_type": "code",
|
1157 |
| - "execution_count": 16, |
| 1154 | + "execution_count": 12, |
1158 | 1155 | "metadata": {},
|
1159 | 1156 | "outputs": [
|
1160 | 1157 | {
|
|
1201 | 1198 | "dtype: int64"
|
1202 | 1199 | ]
|
1203 | 1200 | },
|
1204 |
| - "execution_count": 16, |
| 1201 | + "execution_count": 12, |
1205 | 1202 | "metadata": {},
|
1206 | 1203 | "output_type": "execute_result"
|
1207 | 1204 | }
|
1208 | 1205 | ],
|
1209 | 1206 | "source": [
|
1210 | 1207 | "# show the number of missing values in each column in descending order\n",
|
1211 |
| - "combined_df.isnull().sum().sort_values(ascending=False)\n" |
1212 |
| - ] |
1213 |
| - }, |
1214 |
| - { |
1215 |
| - "cell_type": "code", |
1216 |
| - "execution_count": 17, |
1217 |
| - "metadata": {}, |
1218 |
| - "outputs": [ |
1219 |
| - { |
1220 |
| - "data": { |
1221 |
| - "text/plain": [ |
1222 |
| - "['cases.submitter_id', 'case_id', 'cases.disease_type', 'cases.primary_site']" |
1223 |
| - ] |
1224 |
| - }, |
1225 |
| - "execution_count": 17, |
1226 |
| - "metadata": {}, |
1227 |
| - "output_type": "execute_result" |
1228 |
| - } |
1229 |
| - ], |
1230 |
| - "source": [ |
1231 |
| - "# show columns that start with 'case'\n", |
1232 |
| - "case_columns = [col for col in combined_df.columns if col.startswith('case')]\n", |
1233 |
| - "case_columns" |
| 1208 | + "df.isnull().sum().sort_values(ascending=False)" |
1234 | 1209 | ]
|
1235 | 1210 | },
|
1236 | 1211 | {
|
|
1249 | 1224 | },
|
1250 | 1225 | {
|
1251 | 1226 | "cell_type": "code",
|
1252 |
| - "execution_count": 19, |
| 1227 | + "execution_count": 13, |
1253 | 1228 | "metadata": {
|
1254 | 1229 | "slideshow": {
|
1255 | 1230 | "slide_type": "slide"
|
|
1266 | 1241 | "dtype: int64"
|
1267 | 1242 | ]
|
1268 | 1243 | },
|
1269 |
| - "execution_count": 19, |
| 1244 | + "execution_count": 13, |
1270 | 1245 | "metadata": {},
|
1271 | 1246 | "output_type": "execute_result"
|
1272 | 1247 | }
|
1273 | 1248 | ],
|
1274 | 1249 | "source": [
|
1275 | 1250 | "# show which columns have the value 'Unknown' in them and show how many each column has in descending order\n",
|
1276 |
| - "unknown_values = combined_df.isin(['Unknown']).sum().sort_values(ascending=False)\n", |
| 1251 | + "unknown_values = df.isin(['Unknown']).sum().sort_values(ascending=False)\n", |
1277 | 1252 | "unknown_values[unknown_values > 0]"
|
1278 | 1253 | ]
|
1279 | 1254 | },
|
|
1293 | 1268 | },
|
1294 | 1269 | {
|
1295 | 1270 | "cell_type": "code",
|
1296 |
| - "execution_count": 21, |
| 1271 | + "execution_count": 14, |
1297 | 1272 | "metadata": {},
|
1298 | 1273 | "outputs": [
|
1299 | 1274 | {
|
|
1340 | 1315 | "dtype: int64"
|
1341 | 1316 | ]
|
1342 | 1317 | },
|
1343 |
| - "execution_count": 21, |
| 1318 | + "execution_count": 14, |
1344 | 1319 | "metadata": {},
|
1345 | 1320 | "output_type": "execute_result"
|
1346 | 1321 | }
|
1347 | 1322 | ],
|
1348 | 1323 | "source": [
|
1349 | 1324 | "# show the number of unique values in each column in descending order\n",
|
1350 |
| - "unique_values = combined_df.nunique().sort_values(ascending=False)\n", |
| 1325 | + "unique_values = df.nunique().sort_values(ascending=False)\n", |
1351 | 1326 | "unique_values"
|
1352 | 1327 | ]
|
1353 | 1328 | },
|
|
1360 | 1335 | },
|
1361 | 1336 | {
|
1362 | 1337 | "cell_type": "code",
|
1363 |
| - "execution_count": 22, |
| 1338 | + "execution_count": 15, |
1364 | 1339 | "metadata": {},
|
1365 | 1340 | "outputs": [
|
1366 | 1341 | {
|
|
1400 | 1375 | "# show 5 unique values of columns with unique values less than 100\n",
|
1401 | 1376 | "for col, n_unique in unique_values.items():\n",
|
1402 | 1377 | " if n_unique < 100:\n",
|
1403 |
| - " unique_vals = combined_df[col].unique()\n", |
| 1378 | + " unique_vals = df[col].unique()\n", |
1404 | 1379 | " print(f\"{col}: {unique_vals[:5]}\")"
|
1405 | 1380 | ]
|
1406 | 1381 | },
|
|
1420 | 1395 | },
|
1421 | 1396 | {
|
1422 | 1397 | "cell_type": "code",
|
1423 |
| - "execution_count": 24, |
| 1398 | + "execution_count": 16, |
1424 | 1399 | "metadata": {},
|
1425 | 1400 | "outputs": [
|
1426 | 1401 | {
|
|
1429 | 1404 | "36008"
|
1430 | 1405 | ]
|
1431 | 1406 | },
|
1432 |
| - "execution_count": 24, |
| 1407 | + "execution_count": 16, |
1433 | 1408 | "metadata": {},
|
1434 | 1409 | "output_type": "execute_result"
|
1435 | 1410 | }
|
1436 | 1411 | ],
|
1437 | 1412 | "source": [
|
1438 | 1413 | "# show the number of duplicate records in the dataframe\n",
|
1439 |
| - "n_duplicates = combined_df.duplicated().sum()\n", |
| 1414 | + "n_duplicates = df.duplicated().sum()\n", |
1440 | 1415 | "n_duplicates"
|
1441 | 1416 | ]
|
1442 | 1417 | },
|
|
1451 | 1426 | },
|
1452 | 1427 | {
|
1453 | 1428 | "cell_type": "code",
|
1454 |
| - "execution_count": 28, |
| 1429 | + "execution_count": 17, |
1455 | 1430 | "metadata": {},
|
1456 | 1431 | "outputs": [
|
1457 | 1432 | {
|
@@ -19465,14 +19440,14 @@
|
19465 | 19440 | "Name: count, dtype: int64"
|
19466 | 19441 | ]
|
19467 | 19442 | },
|
19468 |
| - "execution_count": 28, |
| 19443 | + "execution_count": 17, |
19469 | 19444 | "metadata": {},
|
19470 | 19445 | "output_type": "execute_result"
|
19471 | 19446 | }
|
19472 | 19447 | ],
|
19473 | 19448 | "source": [
|
19474 | 19449 | "# count how many records share the same case_id\n",
|
19475 |
| - "case_id_counts = combined_df['case_id'].value_counts()\n", |
| 19450 | + "case_id_counts = df['case_id'].value_counts()\n", |
19476 | 19451 | "case_id_counts"
|
19477 | 19452 | ]
|
19478 | 19453 | },
|
|
19485 | 19460 | },
|
19486 | 19461 | {
|
19487 | 19462 | "cell_type": "code",
|
19488 |
| - "execution_count": 29, |
| 19463 | + "execution_count": 18, |
19489 | 19464 | "metadata": {},
|
19490 | 19465 | "outputs": [
|
19491 | 19466 | {
|
|
19834 | 19809 | "36299 NaN NaN "
|
19835 | 19810 | ]
|
19836 | 19811 | },
|
19837 |
| - "execution_count": 29, |
| 19812 | + "execution_count": 18, |
19838 | 19813 | "metadata": {},
|
19839 | 19814 | "output_type": "execute_result"
|
19840 | 19815 | }
|
|
19843 | 19818 | "# show the records with the case_id 40e57344-a8ad-4de4-92e4-6e681c0593b7\n",
|
19844 | 19819 | "case_id = '40e57344-a8ad-4de4-92e4-6e681c0593b7'\n",
|
19845 | 19820 | "\n",
|
19846 |
| - "combined_df[combined_df['case_id'] == '40e57344-a8ad-4de4-92e4-6e681c0593b7']" |
| 19821 | + "df[df['case_id'] == '40e57344-a8ad-4de4-92e4-6e681c0593b7']" |
19847 | 19822 | ]
|
19848 | 19823 | },
|
19849 | 19824 | {
|
|
0 commit comments