
Commit a99c0f6

Merge pull request #3 from timmanik/main
Added completed notebooks and a folder for troubleshooting guides
2 parents 31f8950 + 096ee05

10 files changed: +130,842 -48 lines

01-cancer-data-analysis/completed-notebooks/combined_data.csv (+72,017 lines; large diff not rendered)
01-cancer-data-analysis/completed-notebooks/combined_data_cleaned.csv (+18,004 lines; large diff not rendered)
01-cancer-data-analysis/completed-notebooks/fm-ad-notebook-exploration-COMPLETED.ipynb (+19,853 lines; large diff not rendered)
01-cancer-data-analysis/completed-notebooks/fm-ad-notebook-processing-COMPLETED.ipynb (+20,170 lines; large diff not rendered)
01-cancer-data-analysis/completed-notebooks/fm-ad-notebook-visualization-COMPLETED.ipynb (+653 lines; large diff not rendered)

01-cancer-data-analysis/fm-ad-notebook-exploration.ipynb

+23 -8
@@ -191,10 +191,10 @@
     "    dataframes.append(df)\n",
     "\n",
     "# Concatenate all the dataframes in the list into a single dataframe\n",
-    "combined_df = pd.concat(dataframes, ignore_index=True)\n",
+    "df = pd.concat(dataframes, ignore_index=True)\n",
     "\n",
     "# Save dataframe to a CSV file\n",
-    "combined_df.to_csv('combined_data.csv', index=False)"
+    "df.to_csv('combined_data.csv', index=False)"
    ]
   },
   {
@@ -371,7 +371,9 @@
    },
    "outputs": [],
    "source": [
-    "# show which columns have the value 'Unknown' in them and show how many each column has in descending order"
+    "# show which columns have the value 'Unknown' in them and show how many each column has in descending order\n",
+    "unknown_values = df.isin(['Unknown']).sum().sort_values(ascending=False)\n",
+    "unknown_values[unknown_values > 0]"
    ]
   },
   {
@@ -394,7 +396,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# show the number of unique values in each column in descending order"
+    "# show the number of unique values in each column in descending order\n",
+    "unique_values = df.nunique().sort_values(ascending=False)\n",
+    "unique_values"
    ]
   },
   {
@@ -410,7 +414,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# show 5 unique values of columns with unique values less than 100"
+    "# show 5 unique values of columns with unique values less than 100\n",
+    "for col, n_unique in unique_values.items():\n",
+    "    if n_unique < 100:\n",
+    "        unique_vals = df[col].unique()\n",
+    "        print(f\"{col}: {unique_vals[:5]}\")"
    ]
   },
   {
@@ -433,7 +441,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# show the number of duplicate records in the dataframe"
+    "# show the number of duplicate records in the dataframe\n",
+    "n_duplicates = df.duplicated().sum()\n",
+    "n_duplicates"
    ]
   },
   {
@@ -451,7 +461,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# count how many records share the same case_id"
+    "# count how many records share the same case_id\n",
+    "case_id_counts = df['case_id'].value_counts()\n",
+    "case_id_counts"
    ]
   },
   {
@@ -467,7 +479,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# show the records with the case_id 40e57344-a8ad-4de4-92e4-6e681c0593b7"
+    "# show the records with the case_id 40e57344-a8ad-4de4-92e4-6e681c0593b7\n",
+    "case_id = '40e57344-a8ad-4de4-92e4-6e681c0593b7'\n",
+    "\n",
+    "df[df['case_id'] == '40e57344-a8ad-4de4-92e4-6e681c0593b7']"
    ]
   },
   {
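Taken together, the cells added above form a small exploration script. Below is a minimal standalone sketch that mirrors the + lines so the steps can be run outside the notebook, assuming pandas is installed and combined_data.csv was written by the concatenation cell:

import pandas as pd

df = pd.read_csv('combined_data.csv')

# Columns containing the value 'Unknown', with counts in descending order
unknown_values = df.isin(['Unknown']).sum().sort_values(ascending=False)
print(unknown_values[unknown_values > 0])

# Number of unique values per column, in descending order
unique_values = df.nunique().sort_values(ascending=False)
print(unique_values)

# First 5 unique values of low-cardinality columns
for col, n_unique in unique_values.items():
    if n_unique < 100:
        print(f"{col}: {df[col].unique()[:5]}")

# Duplicate records, and how many records share each case_id
print(df.duplicated().sum())
print(df['case_id'].value_counts())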

01-cancer-data-analysis/fm-ad-notebook-processing.ipynb

+49 -21
@@ -44,8 +44,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# convert the combined_data.csv to dataframe called combined_df\n",
-    "combined_df = pd.read_csv('combined_data.csv')"
+    "# convert the combined_data.csv to dataframe called df\n",
+    "df = pd.read_csv('combined_data.csv')"
    ]
   },
   {
@@ -134,15 +134,25 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "# show the dictionary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# show if case_id is in the columns"
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# create a copy of the current dataframe\n",
     "# drop columns from the dictionary above"
    ]
   },
@@ -184,7 +194,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# change values 'Unknown' to NaN in the dataframe using numpy and create a new dataframe"
+    "# change values 'Unknown' to NaN in the dataframe using numpy"
    ]
   },
   {
@@ -223,7 +233,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# drop duplicate records in the dataframe and create a new dataframe\n"
+    "# drop duplicate records in the dataframe\n"
    ]
   },
   {
@@ -391,7 +401,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Check if all the values in the dictionary are True if so print \"All records complement each other.\" otherwise print \"Not all records complement each other.\""
+    "# Check if all the values in the dictionary are True if so print \"All records complement each other.\" otherwise print \"Not all records complement each other.\"\n",
+    "if all(case_id_dict.values()):\n",
+    "    print(\"All records complement each other.\")\n",
+    "else:\n",
+    "    print(\"Not all records complement each other.\")"
    ]
   },
   {
@@ -407,7 +421,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Combine records with the same 'case_id' and take the first non-null value for each group. Then create a new dataframe."
+    "# Combine records with the same 'case_id' and take the first non-null value for each group\n",
+    "df = df.groupby('case_id').first().reset_index()"
    ]
   },
   {
@@ -423,7 +438,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# show the shape of the new dataframe"
+    "# show the shape of the dataframe\n",
+    "df.shape"
    ]
   },
   {
@@ -439,7 +455,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# show the number of duplicate records in the new dataframe"
+    "# show the number of duplicate records in the dataframe\n",
+    "df.duplicated().sum()"
    ]
   },
   {
@@ -455,7 +472,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# show number of unique values in each column in descending order"
+    "# show number of unique values in each column in descending order\n",
+    "df.nunique().sort_values(ascending=False)"
    ]
   },
   {
@@ -471,7 +489,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# check to see if there are any null values in the dataframe"
+    "# check to see if there are any null values in the dataframe\n",
+    "df.isnull().sum().sum()"
    ]
   },
   {
@@ -487,7 +506,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# show the number unique values of the columns that have null values"
+    "# show the number unique values of the columns that have null values\n",
+    "df.isnull().sum()[df.isnull().sum() > 0]"
    ]
   },
   {
@@ -510,7 +530,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# describe stats on diagnoses.age_at_diagnosis column"
+    "# describe stats on diagnoses.age_at_diagnosis column\n",
+    "df['diagnoses.age_at_diagnosis'].describe()"
    ]
   },
   {
@@ -542,7 +563,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# create a new dataframe, create a new column 'diagnoses.age_at_diagnosis_years' by dividing 'diagnoses.age_at_diagnosis' by 365, and drop the 'diagonses.age_at_diagnosis' column"
+    "# create a new column 'diagnoses.age_at_diagnosis_years' by dividing 'diagnoses.age_at_diagnosis' by 365, and drop the 'diagonses.age_at_diagnosis' column\n",
+    "df['diagnoses.age_at_diagnosis_years'] = df['diagnoses.age_at_diagnosis'] / 365"
    ]
   },
   {
@@ -558,7 +580,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# count how many records that have the value of 'diagnosis.age_at_diagnosis_years' greater or equal to 89"
+    "# count how many records that have the value of 'diagnosis.age_at_diagnosis_years' greater or equal to 89\n",
+    "(df['diagnoses.age_at_diagnosis_years'] >= 89).sum()"
    ]
   },
   {
@@ -567,7 +590,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# drop the record with 'diagnosis.age_at_diagnosis_years' greater or equal to 89"
+    "# drop the record with 'diagnosis.age_at_diagnosis_years' greater or equal to 89\n",
+    "df = df[df['diagnoses.age_at_diagnosis_years'] < 89]"
    ]
   },
   {
@@ -583,7 +607,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# round down the diagnoses.age_at_diagnosis_years column and convert to integer"
+    "# round down the diagnoses.age_at_diagnosis_years column and convert to integer\n",
+    "df['diagnoses.age_at_diagnosis_years'] = df['diagnoses.age_at_diagnosis_years'].apply(np.floor).astype(int)"
    ]
   },
   {
@@ -599,7 +624,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# show statistical summary of the diagnoses.age_at_diagnosis_years column"
+    "# show statistical summary of the diagnoses.age_at_diagnosis_years column\n",
+    "df['diagnoses.age_at_diagnosis_years'].describe()"
    ]
   },
   {
@@ -615,7 +641,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# drop diagnosis.age_at_diagnosis column"
+    "# drop diagnosis.age_at_diagnosis column\n",
+    "df.drop(columns=['diagnoses.age_at_diagnosis'], inplace=True)"
    ]
   },
   {
@@ -638,7 +665,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Save dataframe to a new csv file named combined_data_cleaned.csv"
+    "# Save dataframe to a new csv file named combined_data_cleaned.csv\n",
+    "df.to_csv('combined_data_cleaned.csv', index=False)"
    ]
   }
  ],
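Several cells in this diff are deliberately left as comment-only prompts (the notebooks are written to be completed with GitHub Copilot). A hedged sketch of plausible completions is below; columns_to_drop is a hypothetical placeholder, since the actual dictionary of columns to drop is built earlier in the notebook and does not appear in this diff:

import numpy as np
import pandas as pd

df = pd.read_csv('combined_data.csv')

# Hypothetical placeholder; the real dictionary is defined earlier in the notebook
columns_to_drop = {'some_column': True}

# show the dictionary
print(columns_to_drop)

# show if case_id is in the columns
print('case_id' in df.columns)

# drop columns from the dictionary above
df = df.drop(columns=[c for c in columns_to_drop if c in df.columns])

# change values 'Unknown' to NaN in the dataframe using numpy
df = df.replace('Unknown', np.nan)

# drop duplicate records in the dataframe
df = df.drop_duplicates()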

01-cancer-data-analysis/fm-ad-notebook-visualization.ipynb

+8 -17
@@ -53,23 +53,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "combined_data_cleansed_df = pd.read_csv('combined_data_cleaned.csv')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Let's rename our dataframe to df so that it will be easier to use the code suggestions from GitHub Copilot chat."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df = combined_data_cleansed_df"
+    "df = pd.read_csv('combined_data.csv')"
+    "df = pd.read_csv('combined_data_cleaned.csv')"
    ]
   },
   {
@@ -112,6 +96,13 @@
     "# show first few records"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You should expect to see the dimension (18003, 24)."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,

02-custom-data-analysis/cyber-defense-notebook.ipynb

+13 -2
@@ -93,9 +93,20 @@
    "client = boto3.client('s3', config=Config(signature_version=UNSIGNED))\n",
    "cyber_bucket = 'cse-cic-ids2018'\n",
    "cyber_prefix = f'Processed Traffic Data for ML Algorithms'\n",
+    "file_name = 'Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv'\n",
    "\n",
-    "obj = client.get_object(Bucket= cyber_bucket , Key = cyber_prefix + '/' + 'Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv')\n",
-    "df = pd.read_csv(io.BytesIO(obj['Body'].read()), encoding='utf8')"
+    "obj = client.get_object(Bucket= cyber_bucket , Key = cyber_prefix + '/' + file_name)\n",
+    "df = pd.read_csv(io.BytesIO(obj['Body'].read()), encoding='utf8')\n",
+    "df.to_csv(file_name, index=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv(file_name)"
    ]
   },
   {
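This change caches the downloaded CSV locally (df.to_csv(file_name, index=False)) so the next cell can reload it with pd.read_csv(file_name) rather than re-fetching from S3. One possible refinement, not part of this commit, is to skip the anonymous S3 download entirely when the cached file already exists; a sketch under that assumption (the raw object is written verbatim instead of being round-tripped through pandas as in the commit):

import io
import os

import boto3
import pandas as pd
from botocore import UNSIGNED
from botocore.config import Config

cyber_bucket = 'cse-cic-ids2018'
cyber_prefix = 'Processed Traffic Data for ML Algorithms'
file_name = 'Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv'

if not os.path.exists(file_name):
    # Anonymous (unsigned) request against the public bucket
    client = boto3.client('s3', config=Config(signature_version=UNSIGNED))
    obj = client.get_object(Bucket=cyber_bucket, Key=cyber_prefix + '/' + file_name)
    with open(file_name, 'wb') as f:
        f.write(obj['Body'].read())

df = pd.read_csv(file_name)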
