Skip to content

Commit 71c96bd

Browse files
authored
Add files via upload
1 parent 37f9ddd commit 71c96bd

17 files changed

+1194
-0
lines changed

Diff for: 323-Benfords Law/01a-benford_tax_deductions.py

+67
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
"""
2+
Benford's Law, also known as the first-digit law, is a statistical phenomenon
3+
observed in many sets of numerical data. It states that in certain naturally
4+
occurring datasets, smaller leading digits (1, 2, 3, etc.) occur with a higher frequency
5+
than larger digits (4, 5, 6, etc.). According to Benford's Law, the distribution
6+
of leading digits follows a logarithmic pattern, where smaller digits are more
7+
likely to be the first digit in a number. This surprising and counterintuitive
8+
property is frequently encountered in diverse datasets such as financial transactions,
9+
population numbers, and scientific data, making Benford's Law a useful tool for
10+
detecting anomalies and irregularities in numerical datasets.
11+
12+
In this python code, we analyze the distribution of leading digits in tax deduction
13+
amounts, with the objective of verifying whether the data adheres to Benford's Law.
14+
15+
The observed frequencies of the leading digits are computed and compared against
16+
the expected frequencies predicted by Benford's Law.
17+
18+
19+
"""
20+
21+
22+
import csv
23+
import numpy as np
24+
import matplotlib.pyplot as plt
25+
import seaborn as sns
26+
27+
# Load tax return data from CSV
csv_file = "data/synthetic_tax_return_data.csv"

# Initialize a list to store deduction amounts
deduction_amounts = []

# Read the CSV file and extract deduction amounts.
# newline='' leaves line-ending handling to the csv module, which is what
# its documentation recommends for files passed to csv.reader.
with open(csv_file, mode='r', newline='') as file:
    reader = csv.reader(file)
    next(reader, None)  # Skip the header row (no error if the file is empty)
    for row in reader:
        if len(row) < 5:
            continue  # Blank or short row: there is no fifth column to read
        # Drop the currency symbol and thousands separators, e.g. "$1,200.00"
        deduction = row[4].replace('$', '').replace(',', '').strip()
        if deduction:
            deduction_amounts.append(float(deduction))

# Calculate the leading digits for deduction amounts.
# Leading zeros and the decimal point are stripped first so that an amount
# below 1 (e.g. 0.75) yields its first significant digit (7) rather than 0.
leading_digits = [
    int(str(amount).lstrip('0.')[0])
    for amount in deduction_amounts
    if amount > 0
]

# Calculate observed frequencies (counts of leading digits)
observed_counts = [leading_digits.count(digit) for digit in range(1, 10)]

# Calculate the expected frequencies according to Benford's Law.
# The values are kept as real numbers: truncating them to integers would
# understate every expected count, which is noticeable for small samples.
total_records = len(leading_digits)
expected_counts = [total_records * np.log10(1 + 1 / digit) for digit in range(1, 10)]

# Define the digits (1 through 9) for the x-axis
digits = range(1, 10)

# Create a line plot for observed frequencies
sns.lineplot(x=digits, y=observed_counts, label='Observed')

# Create a line plot for expected frequencies
sns.lineplot(x=digits, y=expected_counts, label='Expected')

# Add labels and a legend
plt.xlabel('Leading Digit')
plt.ylabel('Frequency')
plt.title('Leading Digit Distribution for Deductions (Observed vs. Expected)')
plt.legend()

# Show the plot
plt.show()

Diff for: 323-Benfords Law/01b-synthetic_tax_return_csv.py

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import csv
2+
3+
# Define the synthetic tax return data.
# The first row is the header; the remaining rows hold one deduction each
# (the name and income columns are only filled in where applicable).
tax_return_data = [
    ["Name", "Income Source", "Income", "Deduction Type", "Deduction Amount"],
    ["John Doe", "Salary", "$50,000.00", "Mortgage Interest", "$1,200.00"],
    ["", "Freelance Income", "$7,500.00", "Property Taxes", "$900.00"],
    ["", "Rental Property 1", "$12,000.00", "State Income Tax", "$2,000.00"],
    ["", "Investment Dividends", "$2,500.00", "Health Insurance", "$1,200.00"],
    ["", "Business Income", "$10,000.00", "Charitable Donations", "$500.00"],
    ["", "", "", "Home Office Expense", "$600.00"],
    ["", "", "", "Education Expenses", "$1,800.00"],
    ["", "", "", "Business Travel", "$850.00"],
    ["", "", "", "Professional Fees", "$1,200.00"],
    ["", "", "", "Utilities", "$750.00"],
    ["", "", "", "Transportation", "$1,000.00"],
    ["", "", "", "Entertainment", "$400.00"],
    ["", "", "", "Medical Expenses", "$2,500.00"],
    ["", "", "", "Charitable Donations", "$700.00"],
    ["", "", "", "Other Deductions", "$1,200.00"],
]

# Specify the CSV file name
csv_file = "data/synthetic_tax_return_data.csv"

# Make sure the target directory exists; otherwise open() below fails with
# FileNotFoundError when the script is run before "data/" has been created.
import os

output_dir = os.path.dirname(csv_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Open the CSV file in write mode and write the data
# (newline='' prevents the csv module's line endings from being translated).
with open(csv_file, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(tax_return_data)

print(f"Tax return data has been saved to {csv_file}")

Diff for: 323-Benfords Law/02-benford_population.py

+55
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
"""
2+
Benford's Law, also known as the first-digit law, is a statistical phenomenon
3+
observed in many sets of numerical data. It states that in certain naturally
4+
occurring datasets, smaller leading digits (1, 2, 3, etc.) occur with a higher frequency
5+
than larger digits (4, 5, 6, etc.). According to Benford's Law, the distribution
6+
of leading digits follows a logarithmic pattern, where smaller digits are more
7+
likely to be the first digit in a number. This surprising and counterintuitive
8+
property is frequently encountered in diverse datasets such as financial transactions,
9+
population numbers, and scientific data, making Benford's Law a useful tool for
10+
detecting anomalies and irregularities in numerical datasets.
11+
12+
In this python code, we analyze the distribution of leading digits in the population
13+
data, with the objective of verifying whether the data adheres to Benford's Law.
14+
15+
The observed frequencies of the leading digits are computed and compared against
16+
the expected frequencies predicted by Benford's Law.
17+
18+
19+
"""
20+
21+
import pandas as pd
22+
import numpy as np
23+
import seaborn as sns
24+
import matplotlib.pyplot as plt
25+
26+
# Load the Population Data from the CSV file
population_df = pd.read_csv("data/population_data.csv")

# Extract the leading digit for each population and store it in a list.
# Missing values are dropped, thousands separators are removed, and any sign,
# leading zeros or decimal point are stripped so that the first remaining
# character is the first significant digit. Entries with no non-zero digit
# (e.g. 0) are left out, since they have no leading digit to count.
leading_digits = []
for population in population_df['Population'].dropna():
    significant = str(population).replace(',', '').strip().lstrip('+-0.')
    if significant:
        leading_digits.append(int(significant[0]))

# Calculate observed frequencies (counts of leading digits)
observed_counts = [leading_digits.count(digit) for digit in range(1, 10)]

# Calculate the expected frequencies according to Benford's Law.
# The values are kept as real numbers: truncating them to integers would
# understate every expected count.
total_records = len(leading_digits)
expected_counts = [total_records * np.log10(1 + 1 / digit) for digit in range(1, 10)]

# Define the digits (1 through 9) for the x-axis
digits = range(1, 10)

# Create a line plot for observed frequencies
sns.lineplot(x=digits, y=observed_counts, label='Observed')

# Create a line plot for expected frequencies
sns.lineplot(x=digits, y=expected_counts, label='Expected')

# Add labels and a legend
plt.xlabel('Leading Digit')
plt.ylabel('Frequency')
plt.title('Leading Digit Distribution (Population Data)')
plt.legend()

# Show the plot
plt.show()

Diff for: 323-Benfords Law/03-benford_gdp.py

+55
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
"""
2+
Benford's Law, also known as the first-digit law, is a statistical phenomenon
3+
observed in many sets of numerical data. It states that in certain naturally
4+
occurring datasets, smaller leading digits (1, 2, 3, etc.) occur with a higher frequency
5+
than larger digits (4, 5, 6, etc.). According to Benford's Law, the distribution
6+
of leading digits follows a logarithmic pattern, where smaller digits are more
7+
likely to be the first digit in a number. This surprising and counterintuitive
8+
property is frequently encountered in diverse datasets such as financial transactions,
9+
population numbers, and scientific data, making Benford's Law a useful tool for
10+
detecting anomalies and irregularities in numerical datasets.
11+
12+
In this python code, we analyze the distribution of leading digits in national GDP
13+
amounts, with the objective of verifying whether the data adheres to Benford's Law.
14+
15+
The observed frequencies of the leading digits are computed and compared against
16+
the expected frequencies predicted by Benford's Law.
17+
18+
19+
"""
20+
21+
import pandas as pd
22+
import numpy as np
23+
import seaborn as sns
24+
import matplotlib.pyplot as plt
25+
26+
# Load the GDP Data from the CSV file
gdp_df = pd.read_csv("data/gdp_data.csv")

# Extract the leading digit for each GDP value and store it in a list.
# Missing values are dropped, thousands separators are removed, and any
# currency symbol, sign, leading zeros or decimal point are stripped so that
# the first remaining character is the first significant digit. Entries with
# no non-zero digit (e.g. 0) are left out, since they have no leading digit.
leading_digits = []
for gdp in gdp_df['GDP (nominal, 2022)'].dropna():
    significant = str(gdp).replace(',', '').strip().lstrip('$+-0.')
    if significant:
        leading_digits.append(int(significant[0]))

# Calculate observed frequencies (counts of leading digits)
observed_counts = [leading_digits.count(digit) for digit in range(1, 10)]

# Calculate the expected frequencies according to Benford's Law.
# The values are kept as real numbers: truncating them to integers would
# understate every expected count.
total_records = len(leading_digits)
expected_counts = [total_records * np.log10(1 + 1 / digit) for digit in range(1, 10)]

# Define the digits (1 through 9) for the x-axis
digits = range(1, 10)

# Create a line plot for observed frequencies
sns.lineplot(x=digits, y=observed_counts, label='Observed')

# Create a line plot for expected frequencies
sns.lineplot(x=digits, y=expected_counts, label='Expected')

# Add labels and a legend
plt.xlabel('Leading Digit')
plt.ylabel('Frequency')
plt.title('Leading Digit Distribution (GDP Data)')
plt.legend()

# Show the plot
plt.show()

Diff for: 323-Benfords Law/04-benford_COVID.py

+85
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
"""
2+
Benford's Law, also known as the first-digit law, is a statistical phenomenon
3+
observed in many sets of numerical data. It states that in certain naturally
4+
occurring datasets, smaller leading digits (1, 2, 3, etc.) occur with a higher frequency
5+
than larger digits (4, 5, 6, etc.). According to Benford's Law, the distribution
6+
of leading digits follows a logarithmic pattern, where smaller digits are more
7+
likely to be the first digit in a number. This surprising and counterintuitive
8+
property is frequently encountered in diverse datasets such as financial transactions,
9+
population numbers, and scientific data, making Benford's Law a useful tool for
10+
detecting anomalies and irregularities in numerical datasets.
11+
12+
In this python code, we analyze the distribution of leading digits in COVID cases
13+
and deaths data, with the objective of verifying whether the data adheres to Benford's Law.
14+
15+
The observed frequencies of the leading digits are computed and compared against
16+
the expected frequencies predicted by Benford's Law.
17+
18+
19+
"""
20+
import pandas as pd
21+
import numpy as np
22+
import seaborn as sns
23+
import matplotlib.pyplot as plt
24+
25+
# Load the COVID Data from the CSV file
covid_df = pd.read_csv("data/covid_data.csv")


def _leading_digits(values):
    """
    Return the leading digit of every usable entry in a column.

    Missing entries (NaN) are dropped. Thousands separators are removed and
    any sign, leading zeros or decimal point are stripped, so entries with no
    non-zero digit (e.g. 0) are skipped as well.
    """
    found = []
    for value in values.dropna():
        significant = str(value).replace(",", "").strip().lstrip("+-0.")
        if significant:
            found.append(int(significant[0]))
    return found


def _benford_expected_counts(total_records):
    """
    Return the expected count of each leading digit (1 through 9) under
    Benford's Law for the given number of records.

    The values are real numbers; truncating them to integers would understate
    every expected count.
    """
    return [total_records * np.log10(1 + 1 / digit) for digit in range(1, 10)]


# Verify Benford's Law for Total Cases
total_cases_digits = _leading_digits(covid_df["Total Cases"])
observed_counts_total_cases = [
    total_cases_digits.count(digit) for digit in range(1, 10)
]
total_records_total_cases = len(total_cases_digits)
expected_counts_total_cases = _benford_expected_counts(total_records_total_cases)

# Verify Benford's Law for Total Deaths (missing values are skipped the same
# way as for Total Cases)
total_deaths_digits = _leading_digits(covid_df["Total Deaths"])
observed_counts_total_deaths = [
    total_deaths_digits.count(digit) for digit in range(1, 10)
]
total_records_total_deaths = len(total_deaths_digits)
expected_counts_total_deaths = _benford_expected_counts(total_records_total_deaths)

# Define the digits (1 through 9) for the x-axis
digits = range(1, 10)

# Create a line plot for observed frequencies (Total Cases)
sns.lineplot(x=digits, y=observed_counts_total_cases, label="Observed (Total Cases)")

# Create a line plot for expected frequencies (Total Cases)
sns.lineplot(x=digits, y=expected_counts_total_cases, label="Expected (Total Cases)")

# Add labels and a legend
plt.xlabel("Leading Digit")
plt.ylabel("Frequency")
plt.title("Leading Digit Distribution (COVID Data)")
plt.legend()

# Show the plot
plt.show()

# Create a line plot for observed frequencies (Total Deaths)
sns.lineplot(x=digits, y=observed_counts_total_deaths, label="Observed (Total Deaths)")

# Create a line plot for expected frequencies (Total Deaths)
sns.lineplot(x=digits, y=expected_counts_total_deaths, label="Expected (Total Deaths)")

# Add labels and a legend
plt.xlabel("Leading Digit")
plt.ylabel("Frequency")
plt.title("Leading Digit Distribution (COVID Data)")
plt.legend()

# Show the plot
plt.show()

Diff for: 323-Benfords Law/05-benford_images.py

+80
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
"""
2+
Verification of Benford's law in image data.
3+
4+
This code analyzes the leading digit distribution of Discrete Cosine Transform (DCT)
5+
coefficients in grayscale images. It checks Benford's Law by examining
6+
the frequencies of leading digits (1 through 9) in the pixel values derived
7+
from DCT.
8+
9+
The code reads images in grayscale using opencv library, computes the DCT coefficients,
10+
and plots the observed Benford's Law distribution for each image.
11+
Then we plot the data to visualize and compare the occurrence of leading digits
12+
in different images.
13+
14+
In case you wonder why go through the pain of converting pixel values to DCT...
15+
16+
In the context of Benford's Law, the distribution of leading digits is expected
17+
to follow a logarithmic pattern, where smaller digits (1, 2, 3) occur more
18+
frequently than larger digits (4, 5, 6, 7, 8, 9).
19+
When pixel values are confined to a small range, it can disrupt this natural
20+
logarithmic distribution. For example, in 8 bit images, our pixels have values between
21+
0 to 255. So any bright pixel will always have a leading digit of 2 and never
22+
have a leading digit of 3 or greater.
23+
24+
"""
25+
26+
27+
import cv2
28+
import numpy as np
29+
import matplotlib.pyplot as plt
30+
import seaborn as sns
31+
32+
def leading_digit(pixel_value):
    """
    Helper function to extract the leading (first significant) digit of a value.

    Works for integers as well as real-valued inputs such as DCT coefficients:
    values of 10 or more are scaled down and values below 1 are scaled up
    until exactly one digit remains in front of the decimal point.

    Parameters:
    - pixel_value (int or float): The value to inspect; its sign is ignored.

    Returns:
    - int: The leading digit (1 through 9), or 0 when the value is zero or
      is not a finite number.
    """
    magnitude = abs(float(pixel_value))

    # Zero has no significant digit, NaN fails both comparisons, and infinity
    # would never leave the scaling loop below, so all three map to 0.
    if not 0 < magnitude < float('inf'):
        return 0

    while magnitude >= 10:
        magnitude /= 10
    while magnitude < 1:
        magnitude *= 10

    # Drop the fractional part; the cap guards against a value just below 10
    # being rounded up to 10.0 by the repeated scaling.
    return min(int(magnitude), 9)
39+
40+
def calculate_observed_counts(coefficients):
    """
    Count how often each digit 1 through 9 occurs as a leading digit.

    The sign of every coefficient is discarded before its leading digit is
    taken with leading_digit(). The result is a list of nine counts, ordered
    from digit 1 to digit 9.
    """
    first_digits = list(map(leading_digit, map(abs, coefficients)))

    tallies = []
    for digit in range(1, 10):
        tallies.append(first_digits.count(digit))
    return tallies
47+
48+
def plot_observed_benford_law(image_paths, labels):
    """
    Plot the observed Benford's Law distribution for a list of images.

    Each image is read in grayscale and transformed with the DCT; the counts
    of the leading digits (1 through 9) of its coefficients are then drawn as
    one line per image on a shared figure.

    Parameters:
    - image_paths (list): List of paths to image files.
    - labels (list): List of labels for each image, in the same order as
      image_paths.

    Raises:
    - OSError: If an image cannot be read from its path.
    """
    plt.figure(figsize=(10, 6))

    # Define custom colors for observed lines; they are reused cyclically
    # when there are more images than colors.
    observed_colors = ['darkred', 'darkgreen', 'darkblue']

    for i, image_path in enumerate(image_paths):
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            # cv2.imread reports failure by returning None rather than raising
            raise OSError(f"Could not read image: {image_path}")

        # NOTE(review): cv2.dct is documented to support only even-sized
        # arrays - confirm the input images satisfy this.
        dct_coefficients = cv2.dct(np.float32(image))
        selected_coefficients = dct_coefficients.flatten()

        observed_counts = calculate_observed_counts(selected_coefficients)

        sns.lineplot(
            x=range(1, 10),
            y=observed_counts,
            label=f'{labels[i]} - Observed',
            color=observed_colors[i % len(observed_colors)],
        )

    plt.xlabel('Leading Digit')
    plt.ylabel('Frequency')
    plt.title('Leading Digit Distribution (Observed)')
    plt.legend()
    plt.show()
76+
77+
# Example usage: plot the observed leading-digit counts of three images,
# pairing each image path with the label at the same position.
labels = ['Image 1', 'Image 2', 'Image 3']
image_paths = ['data/sat_img1.jpg', 'data/sat_img2.jpg', 'data/sat_img3.jpg']
plot_observed_benford_law(image_paths=image_paths, labels=labels)

Diff for: 323-Benfords Law/Benfords Law.pptx

446 KB
Binary file not shown.

0 commit comments

Comments
 (0)