|
44 | 44 | "metadata": {},
|
45 | 45 | "outputs": [],
|
46 | 46 | "source": [
|
47 |
| - "# convert the combined_data.csv to dataframe called combined_df\n", |
48 |
| - "combined_df = pd.read_csv('combined_data.csv')" |
| 47 | + "# convert the combined_data.csv to dataframe called df\n", |
| 48 | + "df = pd.read_csv('combined_data.csv')" |
49 | 49 | ]
|
50 | 50 | },
|
51 | 51 | {
|
|
134 | 134 | "execution_count": null,
|
135 | 135 | "metadata": {},
|
136 | 136 | "outputs": [],
|
137 |
| - "source": [] |
| 137 | + "source": [ |
| 138 | + "# show the dictionary" |
| 139 | + ] |
| 140 | + }, |
| 141 | + { |
| 142 | + "cell_type": "code", |
| 143 | + "execution_count": null, |
| 144 | + "metadata": {}, |
| 145 | + "outputs": [], |
| 146 | + "source": [ |
| 147 | + "# show if case_id is in the columns" |
| 148 | + ] |
138 | 149 | },
|
139 | 150 | {
|
140 | 151 | "cell_type": "code",
|
141 | 152 | "execution_count": null,
|
142 | 153 | "metadata": {},
|
143 | 154 | "outputs": [],
|
144 | 155 | "source": [
|
145 |
| - "# create a copy of the current dataframe\n", |
146 | 156 | "# drop columns from the dictionary above"
|
147 | 157 | ]
|
148 | 158 | },
|
|
184 | 194 | "metadata": {},
|
185 | 195 | "outputs": [],
|
186 | 196 | "source": [
|
187 |
| - "# change values 'Unknown' to NaN in the dataframe using numpy and create a new dataframe" |
| 197 | + "# change values 'Unknown' to NaN in the dataframe using numpy" |
188 | 198 | ]
|
189 | 199 | },
|
190 | 200 | {
|
|
223 | 233 | "metadata": {},
|
224 | 234 | "outputs": [],
|
225 | 235 | "source": [
|
226 |
| - "# drop duplicate records in the dataframe and create a new dataframe\n" |
| 236 | + "# drop duplicate records in the dataframe\n" |
227 | 237 | ]
|
228 | 238 | },
|
229 | 239 | {
|
|
391 | 401 | "metadata": {},
|
392 | 402 | "outputs": [],
|
393 | 403 | "source": [
|
394 |
| - "# Check if all the values in the dictionary are True if so print \"All records complement each other.\" otherwise print \"Not all records complement each other.\"" |
| 404 | + "# Check if all the values in the dictionary are True if so print \"All records complement each other.\" otherwise print \"Not all records complement each other.\"\n", |
| 405 | + "if all(case_id_dict.values()):\n", |
| 406 | + " print(\"All records complement each other.\")\n", |
| 407 | + "else:\n", |
| 408 | + " print(\"Not all records complement each other.\")" |
395 | 409 | ]
|
396 | 410 | },
|
397 | 411 | {
|
|
407 | 421 | "metadata": {},
|
408 | 422 | "outputs": [],
|
409 | 423 | "source": [
|
410 |
| - "# Combine records with the same 'case_id' and take the first non-null value for each group. Then create a new dataframe." |
| 424 | + "# Combine records with the same 'case_id' and take the first non-null value for each group\n", |
| 425 | + "df = df.groupby('case_id').first().reset_index()" |
411 | 426 | ]
|
412 | 427 | },
|
413 | 428 | {
|
|
423 | 438 | "metadata": {},
|
424 | 439 | "outputs": [],
|
425 | 440 | "source": [
|
426 |
| - "# show the shape of the new dataframe" |
| 441 | + "# show the shape of the dataframe\n", |
| 442 | + "df.shape" |
427 | 443 | ]
|
428 | 444 | },
|
429 | 445 | {
|
|
439 | 455 | "metadata": {},
|
440 | 456 | "outputs": [],
|
441 | 457 | "source": [
|
442 |
| - "# show the number of duplicate records in the new dataframe" |
| 458 | + "# show the number of duplicate records in the dataframe\n", |
| 459 | + "df.duplicated().sum()" |
443 | 460 | ]
|
444 | 461 | },
|
445 | 462 | {
|
|
455 | 472 | "metadata": {},
|
456 | 473 | "outputs": [],
|
457 | 474 | "source": [
|
458 |
| - "# show number of unique values in each column in descending order" |
| 475 | + "# show number of unique values in each column in descending order\n", |
| 476 | + "df.nunique().sort_values(ascending=False)" |
459 | 477 | ]
|
460 | 478 | },
|
461 | 479 | {
|
|
471 | 489 | "metadata": {},
|
472 | 490 | "outputs": [],
|
473 | 491 | "source": [
|
474 |
| - "# check to see if there are any null values in the dataframe" |
| 492 | + "# check to see if there are any null values in the dataframe\n", |
| 493 | + "df.isnull().sum().sum()" |
475 | 494 | ]
|
476 | 495 | },
|
477 | 496 | {
|
|
487 | 506 | "metadata": {},
|
488 | 507 | "outputs": [],
|
489 | 508 | "source": [
|
490 |
| - "# show the number unique values of the columns that have null values" |
| 509 | + "# show the number of null values for each column that has null values\n",
| 510 | + "df.isnull().sum()[df.isnull().sum() > 0]" |
491 | 511 | ]
|
492 | 512 | },
|
493 | 513 | {
|
|
510 | 530 | "metadata": {},
|
511 | 531 | "outputs": [],
|
512 | 532 | "source": [
|
513 |
| - "# describe stats on diagnoses.age_at_diagnosis column" |
| 533 | + "# describe stats on diagnoses.age_at_diagnosis column\n", |
| 534 | + "df['diagnoses.age_at_diagnosis'].describe()" |
514 | 535 | ]
|
515 | 536 | },
|
516 | 537 | {
|
|
542 | 563 | "metadata": {},
|
543 | 564 | "outputs": [],
|
544 | 565 | "source": [
|
545 |
| - "# create a new dataframe, create a new column 'diagnoses.age_at_diagnosis_years' by dividing 'diagnoses.age_at_diagnosis' by 365, and drop the 'diagonses.age_at_diagnosis' column" |
| 566 | + "# create a new column 'diagnoses.age_at_diagnosis_years' by dividing 'diagnoses.age_at_diagnosis' by 365, and drop the 'diagnoses.age_at_diagnosis' column\n",
| 567 | + "df['diagnoses.age_at_diagnosis_years'] = df['diagnoses.age_at_diagnosis'] / 365" |
546 | 568 | ]
|
547 | 569 | },
|
548 | 570 | {
|
|
558 | 580 | "metadata": {},
|
559 | 581 | "outputs": [],
|
560 | 582 | "source": [
|
561 |
| - "# count how many records that have the value of 'diagnosis.age_at_diagnosis_years' greater or equal to 89" |
| 583 | + "# count how many records have a 'diagnoses.age_at_diagnosis_years' value greater than or equal to 89\n",
| 584 | + "(df['diagnoses.age_at_diagnosis_years'] >= 89).sum()" |
562 | 585 | ]
|
563 | 586 | },
|
564 | 587 | {
|
|
567 | 590 | "metadata": {},
|
568 | 591 | "outputs": [],
|
569 | 592 | "source": [
|
570 |
| - "# drop the record with 'diagnosis.age_at_diagnosis_years' greater or equal to 89" |
| 593 | + "# drop the records with 'diagnoses.age_at_diagnosis_years' greater than or equal to 89\n",
| 594 | + "df = df[df['diagnoses.age_at_diagnosis_years'] < 89]" |
571 | 595 | ]
|
572 | 596 | },
|
573 | 597 | {
|
|
583 | 607 | "metadata": {},
|
584 | 608 | "outputs": [],
|
585 | 609 | "source": [
|
586 |
| - "# round down the diagnoses.age_at_diagnosis_years column and convert to integer" |
| 610 | + "# round down the diagnoses.age_at_diagnosis_years column and convert to integer\n", |
| 611 | + "df['diagnoses.age_at_diagnosis_years'] = df['diagnoses.age_at_diagnosis_years'].apply(np.floor).astype(int)" |
587 | 612 | ]
|
588 | 613 | },
|
589 | 614 | {
|
|
599 | 624 | "metadata": {},
|
600 | 625 | "outputs": [],
|
601 | 626 | "source": [
|
602 |
| - "# show statistical summary of the diagnoses.age_at_diagnosis_years column" |
| 627 | + "# show statistical summary of the diagnoses.age_at_diagnosis_years column\n", |
| 628 | + "df['diagnoses.age_at_diagnosis_years'].describe()" |
603 | 629 | ]
|
604 | 630 | },
|
605 | 631 | {
|
|
615 | 641 | "metadata": {},
|
616 | 642 | "outputs": [],
|
617 | 643 | "source": [
|
618 |
| - "# drop diagnosis.age_at_diagnosis column" |
| 644 | + "# drop the diagnoses.age_at_diagnosis column\n",
| 645 | + "df.drop(columns=['diagnoses.age_at_diagnosis'], inplace=True)" |
619 | 646 | ]
|
620 | 647 | },
|
621 | 648 | {
|
|
638 | 665 | "metadata": {},
|
639 | 666 | "outputs": [],
|
640 | 667 | "source": [
|
641 |
| - "# Save dataframe to a new csv file named combined_data_cleaned.csv" |
| 668 | + "# Save dataframe to a new csv file named combined_data_cleaned.csv\n", |
| 669 | + "df.to_csv('combined_data_cleaned.csv', index=False)" |
642 | 670 | ]
|
643 | 671 | }
|
644 | 672 | ],
|
|
0 commit comments