Wei-cyber
diff --git a/Diff for: ‎323-Benfords Law/01a-benford_tax_deductions.py
+67 b/Diff for: ‎323-Benfords Law/01a-benford_tax_deductions.py
+67
diff --git a/Diff for: ‎323-Benfords Law/01b-synthetic_tax_return_csv.py
+31 b/Diff for: ‎323-Benfords Law/01b-synthetic_tax_return_csv.py
+31
diff --git a/Diff for: ‎323-Benfords Law/02-benford_population.py
+55 b/Diff for: ‎323-Benfords Law/02-benford_population.py
+55
diff --git a/Diff for: ‎323-Benfords Law/03-benford_gdp.py
+55 b/Diff for: ‎323-Benfords Law/03-benford_gdp.py
+55
diff --git a/Diff for: ‎323-Benfords Law/04-benford_COVID.py
+85 b/Diff for: ‎323-Benfords Law/04-benford_COVID.py
+85
diff --git a/Diff for: ‎323-Benfords Law/05-benford_images.py
+80 b/Diff for: ‎323-Benfords Law/05-benford_images.py
+80
diff --git a/Diff for: ‎323-Benfords Law/Benfords Law.pptx
446 KB b/Diff for: ‎323-Benfords Law/Benfords Law.pptx
446 KB
@@ -0,0 +1,67 @@
+"""
+Benford's Law, also known as the first-digit law, is a statistical phenomenon 
+observed in many sets of numerical data. It states that in certain naturally 
+occurring datasets, small leading digits (1, 2, 3) occur with a higher frequency 
+than large leading digits (7, 8, 9). According to Benford's Law, the distribution 
+of leading digits follows a logarithmic pattern, where smaller digits are more 
+likely to be the first digit in a number. This surprising and counterintuitive 
+property is frequently encountered in diverse datasets such as financial transactions,
+population numbers, and scientific data, making Benford's Law a useful tool for 
+detecting anomalies and irregularities in numerical datasets.
+
+In this python code, we analyze the distribution of leading digits in tax deduction 
+amounts, with the objective of verifying whether the data adheres to Benford's Law. 
+
+The observed frequencies of the leading digits are computed and compared against 
+the expected frequencies predicted by Benford's Law. 
+
+
+"""
+
+
+import csv
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# Load tax return data from CSV
+csv_file = "data/synthetic_tax_return_data.csv"
+
+# Zero-based position of the deduction amount within each CSV row
+DEDUCTION_COLUMN = 4
+
+# Benford's Law only covers the non-zero leading digits 1 through 9
+digits = range(1, 10)
+
+# Read the CSV file and collect every deduction amount as a float.
+# newline='' is the mode the csv module documents for files given to csv.reader.
+deduction_amounts = []
+with open(csv_file, mode='r', newline='') as file:
+    reader = csv.reader(file)
+    next(reader, None)  # Skip the header row; the default avoids StopIteration on an empty file
+    for row in reader:
+        if len(row) <= DEDUCTION_COLUMN:
+            continue  # Short or blank row: there is no deduction column to read
+        deduction = row[DEDUCTION_COLUMN].replace('$', '').replace(',', '').strip()
+        if deduction:
+            deduction_amounts.append(float(deduction))
+
+# Keep only finite, strictly positive amounts: zero has no leading digit.
+amounts = np.asarray(deduction_amounts, dtype=float)
+amounts = amounts[np.isfinite(amounts)]
+amounts = amounts[amounts > 0]
+
+# Scientific notation always starts with the first significant digit, so this
+# is also correct for amounts below 1 (0.5 -> 5), where str(amount)[0] gives 0.
+# 17 significant digits round-trip a float, so no value is rounded up across a
+# power of ten.
+leading_digits = np.array(
+    [int(format(amount, '.16e')[0]) for amount in amounts], dtype=int
+)
+
+# Observed frequencies: how many amounts start with each digit 1..9
+observed_counts = np.bincount(leading_digits, minlength=10)[1:10]
+
+# Expected frequencies under Benford's Law: N * log10(1 + 1/d).
+# Kept as floats; truncating with int() would bias every expected value downward.
+expected_counts = [len(leading_digits) * np.log10(1 + 1 / digit) for digit in digits]
+
+# Create a line plot for observed frequencies
+sns.lineplot(x=list(digits), y=observed_counts, label='Observed')
+
+# Create a line plot for expected frequencies
+sns.lineplot(x=list(digits), y=expected_counts, label='Expected')
+
+# Add labels and a legend
+plt.xlabel('Leading Digit')
+plt.ylabel('Frequency')
+plt.title('Leading Digit Distribution for Deductions (Observed vs. Expected)')
+plt.legend()
+
+# Show the plot
+plt.show()
@@ -0,0 +1,31 @@
+import csv
+import os
+
+# Define the synthetic tax return data.
+# The first row is the header; the name and income columns are left blank on
+# rows that only carry a deduction.
+tax_return_data = [
+    ["Name", "Income Source", "Income", "Deduction Type", "Deduction Amount"],
+    ["John Doe", "Salary", "$50,000.00", "Mortgage Interest", "$1,200.00"],
+    ["", "Freelance Income", "$7,500.00", "Property Taxes", "$900.00"],
+    ["", "Rental Property 1", "$12,000.00", "State Income Tax", "$2,000.00"],
+    ["", "Investment Dividends", "$2,500.00", "Health Insurance", "$1,200.00"],
+    ["", "Business Income", "$10,000.00", "Charitable Donations", "$500.00"],
+    ["", "", "", "Home Office Expense", "$600.00"],
+    ["", "", "", "Education Expenses", "$1,800.00"],
+    ["", "", "", "Business Travel", "$850.00"],
+    ["", "", "", "Professional Fees", "$1,200.00"],
+    ["", "", "", "Utilities", "$750.00"],
+    ["", "", "", "Transportation", "$1,000.00"],
+    ["", "", "", "Entertainment", "$400.00"],
+    ["", "", "", "Medical Expenses", "$2,500.00"],
+    ["", "", "", "Charitable Donations", "$700.00"],
+    ["", "", "", "Other Deductions", "$1,200.00"]
+]
+
+# Specify the CSV file name
+csv_file = "data/synthetic_tax_return_data.csv"
+
+# open() in write mode does not create missing parent directories, so make
+# sure the target directory exists before writing.
+output_dir = os.path.dirname(csv_file)
+if output_dir:
+    os.makedirs(output_dir, exist_ok=True)
+
+# Open the CSV file in write mode and write the data.
+# newline='' lets the csv writer control the line endings itself.
+with open(csv_file, mode='w', newline='') as file:
+    writer = csv.writer(file)
+    writer.writerows(tax_return_data)
+
+print(f"Tax return data has been saved to {csv_file}")
@@ -0,0 +1,55 @@
+"""
+Benford's Law, also known as the first-digit law, is a statistical phenomenon 
+observed in many sets of numerical data. It states that in certain naturally 
+occurring datasets, small leading digits (1, 2, 3) occur with a higher frequency 
+than large leading digits (7, 8, 9). According to Benford's Law, the distribution 
+of leading digits follows a logarithmic pattern, where smaller digits are more 
+likely to be the first digit in a number. This surprising and counterintuitive 
+property is frequently encountered in diverse datasets such as financial transactions,
+population numbers, and scientific data, making Benford's Law a useful tool for 
+detecting anomalies and irregularities in numerical datasets.
+
+In this python code, we analyze the distribution of leading digits in the population 
+data, with the objective of verifying whether the data adheres to Benford's Law. 
+
+The observed frequencies of the leading digits are computed and compared against 
+the expected frequencies predicted by Benford's Law. 
+
+
+"""
+
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+# Load the Population Data from the CSV file
+population_df = pd.read_csv("data/population_data.csv")
+
+# Benford's Law only covers the non-zero leading digits 1 through 9
+digits = range(1, 10)
+
+# Normalise the column to numbers: drop thousands separators and whitespace,
+# and turn anything unparseable (including missing cells) into NaN instead of
+# crashing on int('n') or int('-').
+population_text = population_df['Population'].astype(str).str.replace(r'[,\s]', '', regex=True)
+population_values = pd.to_numeric(population_text, errors='coerce').to_numpy(dtype=float)
+
+# Keep only finite, strictly positive values: zero has no leading digit.
+population_values = population_values[np.isfinite(population_values)]
+population_values = population_values[population_values > 0]
+
+# Scientific notation always starts with the first significant digit.
+# 17 significant digits round-trip a float, so no value is rounded up across a
+# power of ten.
+leading_digits = np.array(
+    [int(format(value, '.16e')[0]) for value in population_values], dtype=int
+)
+
+# Calculate observed frequencies (counts of leading digits 1..9)
+observed_counts = np.bincount(leading_digits, minlength=10)[1:10]
+
+# Calculate the expected frequencies according to Benford's Law: N * log10(1 + 1/d).
+# Kept as floats; truncating with int() would bias every expected value downward.
+total_records = len(leading_digits)
+expected_counts = [total_records * np.log10(1 + 1 / digit) for digit in digits]
+
+# Create a line plot for observed frequencies
+sns.lineplot(x=list(digits), y=observed_counts, label='Observed')
+
+# Create a line plot for expected frequencies
+sns.lineplot(x=list(digits), y=expected_counts, label='Expected')
+
+# Add labels and a legend
+plt.xlabel('Leading Digit')
+plt.ylabel('Frequency')
+plt.title('Leading Digit Distribution (Population Data)')
+plt.legend()
+
+# Show the plot
+plt.show()
@@ -0,0 +1,55 @@
+"""
+Benford's Law, also known as the first-digit law, is a statistical phenomenon 
+observed in many sets of numerical data. It states that in certain naturally 
+occurring datasets, small leading digits (1, 2, 3) occur with a higher frequency 
+than large leading digits (7, 8, 9). According to Benford's Law, the distribution 
+of leading digits follows a logarithmic pattern, where smaller digits are more 
+likely to be the first digit in a number. This surprising and counterintuitive 
+property is frequently encountered in diverse datasets such as financial transactions,
+population numbers, and scientific data, making Benford's Law a useful tool for 
+detecting anomalies and irregularities in numerical datasets.
+
+In this python code, we analyze the distribution of leading digits in national GDP 
+amounts, with the objective of verifying whether the data adheres to Benford's Law. 
+
+The observed frequencies of the leading digits are computed and compared against 
+the expected frequencies predicted by Benford's Law. 
+
+
+"""
+
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+# Load the GDP Data from the CSV file
+gdp_df = pd.read_csv("data/gdp_data.csv")
+
+# Benford's Law only covers the non-zero leading digits 1 through 9
+digits = range(1, 10)
+
+# Normalise the column to numbers: drop thousands separators, currency signs
+# and whitespace, and turn anything unparseable (including missing cells) into
+# NaN instead of crashing on int('n') or int('$').
+gdp_text = gdp_df['GDP (nominal, 2022)'].astype(str).str.replace(r'[$,\s]', '', regex=True)
+gdp_values = pd.to_numeric(gdp_text, errors='coerce').to_numpy(dtype=float)
+
+# Keep only finite, strictly positive values: zero has no leading digit.
+gdp_values = gdp_values[np.isfinite(gdp_values)]
+gdp_values = gdp_values[gdp_values > 0]
+
+# Scientific notation always starts with the first significant digit.
+# 17 significant digits round-trip a float, so no value is rounded up across a
+# power of ten.
+leading_digits = np.array(
+    [int(format(value, '.16e')[0]) for value in gdp_values], dtype=int
+)
+
+# Calculate observed frequencies (counts of leading digits 1..9)
+observed_counts = np.bincount(leading_digits, minlength=10)[1:10]
+
+# Calculate the expected frequencies according to Benford's Law: N * log10(1 + 1/d).
+# Kept as floats; truncating with int() would bias every expected value downward.
+total_records = len(leading_digits)
+expected_counts = [total_records * np.log10(1 + 1 / digit) for digit in digits]
+
+# Create a line plot for observed frequencies
+sns.lineplot(x=list(digits), y=observed_counts, label='Observed')
+
+# Create a line plot for expected frequencies
+sns.lineplot(x=list(digits), y=expected_counts, label='Expected')
+
+# Add labels and a legend
+plt.xlabel('Leading Digit')
+plt.ylabel('Frequency')
+plt.title('Leading Digit Distribution (GDP Data)')
+plt.legend()
+
+# Show the plot
+plt.show()
@@ -0,0 +1,85 @@
+"""
+Benford's Law, also known as the first-digit law, is a statistical phenomenon 
+observed in many sets of numerical data. It states that in certain naturally 
+occurring datasets, small leading digits (1, 2, 3) occur with a higher frequency 
+than large leading digits (7, 8, 9). According to Benford's Law, the distribution 
+of leading digits follows a logarithmic pattern, where smaller digits are more 
+likely to be the first digit in a number. This surprising and counterintuitive 
+property is frequently encountered in diverse datasets such as financial transactions,
+population numbers, and scientific data, making Benford's Law a useful tool for 
+detecting anomalies and irregularities in numerical datasets.
+
+In this python code, we analyze the distribution of leading digits in COVID cases 
+and deaths data, with the objective of verifying whether the data adheres to Benford's Law. 
+
+The observed frequencies of the leading digits are computed and compared against 
+the expected frequencies predicted by Benford's Law. 
+
+
+"""
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+# Load the COVID Data from the CSV file
+covid_df = pd.read_csv("data/covid_data.csv")
+
+# Define the digits (1 through 9) for the x-axis
+digits = range(1, 10)
+
+
+def _benford_counts(column):
+    """
+    Return (observed, expected) leading-digit counts for one numeric column.
+
+    Parameters:
+    - column (pandas.Series): Values as numbers or as strings with thousands
+      separators.
+
+    Returns:
+    - observed (numpy.ndarray): Count of values starting with each digit 1..9.
+    - expected (numpy.ndarray): Benford counts N * log10(1 + 1/d) as floats,
+      where N is the number of values that have a leading digit.
+
+    Missing, unparseable, zero and negative cells are ignored, so they neither
+    crash the digit extraction nor inflate N.
+    """
+    text = column.astype(str).str.replace(r"[,\s]", "", regex=True)
+    values = pd.to_numeric(text, errors="coerce").to_numpy(dtype=float)
+    values = values[np.isfinite(values)]
+    values = values[values > 0]
+    # Scientific notation always starts with the first significant digit;
+    # 17 significant digits round-trip a float, so nothing rounds up across a
+    # power of ten.
+    leading = np.array([int(format(value, ".16e")[0]) for value in values], dtype=int)
+    observed = np.bincount(leading, minlength=10)[1:10]
+    # Floats on purpose: truncating with int() would bias every value downward.
+    expected = leading.size * np.log10(1 + 1 / np.arange(1, 10))
+    return observed, expected
+
+
+# Verify Benford's Law for Total Cases
+observed_counts_total_cases, expected_counts_total_cases = _benford_counts(
+    covid_df["Total Cases"]
+)
+
+# Verify Benford's Law for Total Deaths (missing values are skipped by the helper)
+observed_counts_total_deaths, expected_counts_total_deaths = _benford_counts(
+    covid_df["Total Deaths"]
+)
+
+# Create a line plot for observed frequencies (Total Cases)
+sns.lineplot(x=list(digits), y=observed_counts_total_cases, label="Observed (Total Cases)")
+
+# Create a line plot for expected frequencies (Total Cases)
+sns.lineplot(x=list(digits), y=expected_counts_total_cases, label="Expected (Total Cases)")
+# Add labels and a legend
+plt.xlabel("Leading Digit")
+plt.ylabel("Frequency")
+plt.title("Leading Digit Distribution (COVID Data)")
+plt.legend()
+
+# Show the plot
+plt.show()
+
+
+# Create a line plot for observed frequencies (Total Deaths)
+sns.lineplot(x=list(digits), y=observed_counts_total_deaths, label="Observed (Total Deaths)")
+# Create a line plot for expected frequencies (Total Deaths)
+sns.lineplot(x=list(digits), y=expected_counts_total_deaths, label="Expected (Total Deaths)")
+
+# Add labels and a legend
+plt.xlabel("Leading Digit")
+plt.ylabel("Frequency")
+plt.title("Leading Digit Distribution (COVID Data)")
+plt.legend()
+
+# Show the plot
+plt.show()
@@ -0,0 +1,80 @@
+"""
+Verification of Benford's law in image data.
+
+This code analyzes the leading digit distribution of Discrete Cosine Transform (DCT) 
+coefficients in grayscale images. The goal is to check Benford's Law by plotting 
+the frequencies of leading digits (1 through 9) of the coefficients obtained 
+by applying the DCT to the pixel values. 
+
+The code reads images in grayscale using opencv library, computes the DCT coefficients, 
+and plots the observed Benford's Law distribution for each image. 
+Then we plot the data to visualize and compare the occurrence of leading digits 
+in different images. 
+
+In case you wonder why go through the pain of converting pixel values to DCT...
+
+In the context of Benford's Law, the distribution of leading digits is expected 
+to follow a logarithmic pattern, where smaller digits (1, 2, 3) occur more 
+frequently than larger digits (4, 5, 6, 7, 8, 9). 
+When pixel values are confined to a small range, it can disrupt this natural 
+logarithmic distribution. For example, in 8-bit images, pixel values range from
+0 to 255, so any pixel value of 100 or more has a leading digit of 1 or 2 and
+never 3 or greater. 
+
+"""
+
+
+import cv2
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+def leading_digit(pixel_value):
+    """
+    Return the first significant decimal digit of a number.
+
+    Works for integers and for floats of any magnitude, e.g. 123 -> 1,
+    3.7 -> 3, 0.042 -> 4. The sign is ignored.
+
+    Parameters:
+    - pixel_value (int or float): The value to inspect.
+
+    Returns:
+    - int: A digit from 1 to 9, or 0 when the value has no leading digit
+      (zero, NaN or infinity).
+    """
+    magnitude = abs(float(pixel_value))
+    if magnitude == 0 or not np.isfinite(magnitude):
+        return 0
+    # Scientific notation always starts with the first significant digit, so
+    # fractional values are handled too (repeated // 10 left them unchanged).
+    # 17 significant digits round-trip a float, so nothing rounds up across a
+    # power of ten.
+    return int(format(magnitude, '.16e')[0])
+
+def calculate_observed_counts(coefficients):
+    """
+    Tally how many coefficients have each leading digit from 1 to 9.
+
+    The leading digit of every coefficient's absolute value is obtained from
+    leading_digit(); results that are not equal to one of 1..9 are ignored.
+
+    Returns a list of nine counts, ordered by digit 1 through 9.
+    """
+    tally = dict.fromkeys(range(1, 10), 0)
+    for coeff in coefficients:
+        first = leading_digit(abs(coeff))
+        # Dict lookup matches by equality, so 3.0 is counted under 3
+        if first in tally:
+            tally[first] += 1
+    return [tally[digit] for digit in range(1, 10)]
+
+def plot_observed_benford_law(image_paths, labels):
+    """
+    Plot the observed leading-digit counts of DCT coefficients for a list of images.
+
+    Each image is read as grayscale, transformed with cv2.dct, and the leading
+    digits of all coefficients are tallied with calculate_observed_counts. One
+    line per image is drawn on a shared figure, which is then shown.
+
+    Parameters:
+    - image_paths (list): List of paths to image files.
+    - labels (list): List of labels for each image, in the same order.
+
+    Raises:
+    - FileNotFoundError: If an image cannot be read.
+    """
+    plt.figure(figsize=(10, 6))
+
+    # Define custom colors for observed lines
+    observed_colors = ['darkred', 'darkgreen', 'darkblue']
+
+    for i, image_path in enumerate(image_paths):
+        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
+        if image is None:
+            # cv2.imread signals a missing or unreadable file by returning None
+            raise FileNotFoundError(f"Could not read image: {image_path}")
+
+        # cv2.dct only supports even-sized arrays, so drop a trailing row
+        # and/or column when a dimension is odd.
+        height, width = image.shape[:2]
+        image = image[:height - height % 2, :width - width % 2]
+
+        dct_coefficients = cv2.dct(np.float32(image))
+        selected_coefficients = dct_coefficients.flatten()
+
+        observed_counts = calculate_observed_counts(selected_coefficients)
+
+        # Cycle through the palette so more than three images do not raise IndexError
+        color = observed_colors[i % len(observed_colors)]
+        sns.lineplot(x=range(1, 10), y=observed_counts, label=f'{labels[i]} - Observed', color=color)
+
+    plt.xlabel('Leading Digit')
+    plt.ylabel('Frequency')
+    plt.title('Leading Digit Distribution (Observed)')
+    plt.legend()
+    plt.show()
+
+# Example usage: draw the observed distributions of three images on one chart.
+labels = ['Image 1', 'Image 2', 'Image 3']
+image_paths = ['data/sat_img1.jpg', 'data/sat_img2.jpg', 'data/sat_img3.jpg']
+plot_observed_benford_law(image_paths=image_paths, labels=labels)