What happens?
In the code below, I generate some fake data then use Splink to identify the duplicates. Splink gives me this error: Error was: Parser Error: syntax error at or near "None" Here is the code (followed by the output and error): ############################################################
# 1) DATA GENERATION AND DUPLICATE INJECTION
############################################################
import pandas as pd
import random
from faker import Faker

# Deterministic fake-person generator: seed both Faker and random so the
# dataset (and the duplicate injection below) is reproducible run-to-run.
fake = Faker()
Faker.seed(0)
random.seed(0)

n = 10000
data = []
for i in range(n):
    first_name = fake.first_name()
    last_name = fake.last_name()
    dob = fake.date_of_birth(minimum_age=18, maximum_age=90).strftime("%Y-%m-%d")
    ssn = fake.ssn()
    # Roughly half of the records get an apartment line, the rest are blank.
    apartment = fake.secondary_address() if random.random() < 0.5 else ""
    street_address = fake.street_address()
    city = "Los Angeles"
    state = "California"
    zipcode = fake.zipcode()
    home_phone = fake.phone_number()
    email = fake.email()
    sex = random.choice(["M", "F"])
    # NOTE(review): age is drawn independently of dob, so the two fields will
    # usually disagree — acceptable for linkage test data, but worth knowing.
    age = fake.random_int(min=18, max=90)
    record_id = i + 1
    ethnicity = random.choice(["Hispanic", "Non-Hispanic"])
    reported_race = random.choice(["White", "Black", "Asian", "Other"])
    create_date = fake.date_between(start_date='-10y', end_date='today').strftime("%Y-%m-%d")
    data.append({
        "PER_FirstName": first_name,
        "PER_LastName": last_name,
        "PER_DOB": dob,
        "PER_SSN": ssn,
        "PER_StreetAddress": street_address,
        "PER_Apartment": apartment,
        "PER_City": city,
        "PER_State": state,
        "PER_Zip": zipcode,
        "PER_HomePhone": home_phone,
        "PER_Email": email,
        "PER_Sex": sex,
        "PER_Age": age,
        "id": record_id,
        "PER_Ethnicity": ethnicity,
        "PER_ReportedRace": reported_race,
        "PER_CreateDate": create_date
    })

df = pd.DataFrame(data)

# Inject exact duplicates for ~30% of the rows. group_id ties each copy back
# to its source record so a ground-truth "match" label can be derived.
num_dups = int(n * 0.3)
dup_indices = random.sample(range(n), num_dups)
df_dup = df.iloc[dup_indices].copy()
df["group_id"] = df["id"]
df_dup["group_id"] = df_dup["id"]
df_full = pd.concat([df, df_dup], ignore_index=True)

# Any group_id that occurs more than once marks BOTH the original row and its
# copy as "duplicate"; singletons are "not duplicate".
group_counts = df_full["group_id"].value_counts()
df_full["match"] = df_full["group_id"].apply(lambda gid: "duplicate" if group_counts[gid] > 1 else "not duplicate")

# Shuffle deterministically and persist for the Splink pipeline below.
df_full = df_full.sample(frac=1, random_state=0).reset_index(drop=True)
csv_path = "california_los_angeles_data.csv"
df_full.to_csv(csv_path, index=False)
print("Data generation complete. Saved to california_los_angeles_data.csv.")
print(df_full.head(2))
############################################################
# 2) SPLINK PIPELINE (SINGLE-LINE RULE, NO EXTRA PARENS)
############################################################
import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on
from splink.blocking_analysis import count_comparisons_from_blocking_rule

df_loaded = pd.read_csv(csv_path)

# Rename "id" -> "unique_id" and zero-pad
df_loaded.rename(columns={"id": "unique_id"}, inplace=True)
df_loaded["unique_id"] = df_loaded["unique_id"].astype(str).str.zfill(8)
df_loaded["source_dataset"] = "A"

# Clean fields
df_loaded["PER_FirstName_clean"] = df_loaded["PER_FirstName"].str.lower().str.strip().fillna("")
df_loaded["PER_LastName_clean"] = df_loaded["PER_LastName"].str.lower().str.strip().fillna("")
df_loaded["PER_DOB"] = pd.to_datetime(df_loaded["PER_DOB"]).dt.strftime("%Y-%m-%d")

db_api = DuckDBAPI()

# Single-line combined rule, no extra parentheses:
#   - first name initial + last name initial + DOB
#   OR
#   - exact first name OR exact last name OR exact DOB
# NOTE(review): SQL AND binds tighter than OR, so this parses as
# "(initials AND DOB) OR exact-first OR exact-last OR exact-DOB" — presumably
# the intent, but explicit parentheses would make it unambiguous. Splink's
# blocking analyser (see the logged output) only recognises the three OR'd
# equi-conditions as filterable.
combined_rule = (
    "substr(l.PER_FirstName_clean,1,1) = substr(r.PER_FirstName_clean,1,1) "
    "AND substr(l.PER_LastName_clean,1,1) = substr(r.PER_LastName_clean,1,1) "
    "AND l.PER_DOB = r.PER_DOB "
    "OR "
    "l.PER_FirstName_clean = r.PER_FirstName_clean "
    "OR l.PER_LastName_clean = r.PER_LastName_clean "
    "OR l.PER_DOB = r.PER_DOB"
)
print("\nSingle-line blocking rule:", combined_rule)

candidate_counts = count_comparisons_from_blocking_rule(
    table_or_tables=df_loaded,
    blocking_rule=combined_rule,
    link_type="dedupe_only",
    db_api=db_api,
    unique_id_column_name="unique_id",
    source_dataset_column_name="source_dataset"
)
print("Candidate pair counts for combined rule:")
print(candidate_counts)

# Build settings
# NOTE(review): these comparison dicts ({"comparison_name": ..., "col_name": ...})
# do not match the Splink 4 comparison API, which expects comparison objects
# from splink.comparison_library (e.g. cl.NameComparison(...), cl.ExactMatch(...)).
# Likewise SettingsCreator takes keyword arguments (link_type=..., comparisons=[...],
# blocking_rules_to_generate_predictions=[...]), so SettingsCreator(settings_dict)
# passes this whole dict as the first positional argument. This malformed settings
# object is a plausible cause of the downstream 'syntax error at or near "None"'
# — verify against the Splink 4 SettingsCreator documentation.
settings_dict = {
    "link_type": "dedupe_only",
    "unique_id_column_name": "unique_id",
    "source_dataset_column_name": "source_dataset",
    "column_info_settings": {
        "unique_id_input_column": "unique_id",
        "source_dataset_input_column": "source_dataset"
    },
    "comparisons": [
        # Fuzzy name, street address, phone
        {"comparison_name": "name_comparison", "col_name": "PER_FirstName_clean"},
        {"comparison_name": "name_comparison", "col_name": "PER_LastName_clean"},
        {"comparison_name": "jaro_winkler", "col_name": "PER_StreetAddress"},
        {"comparison_name": "jaro_winkler", "col_name": "PER_HomePhone"},
        # Email comparison
        {"comparison_name": "email_comparison", "col_name": "PER_Email"},
        # Exact for the rest
        {"comparison_name": "exact_match", "col_name": "PER_Sex"},
        {"comparison_name": "exact_match", "col_name": "PER_Ethnicity"},
        {"comparison_name": "exact_match", "col_name": "PER_ReportedRace"}
    ],
    # Only one rule
    "blocking_rules_to_generate_predictions": [
        combined_rule
    ]
}
settings_obj = SettingsCreator(settings_dict)
linker = Linker(df_loaded, settings_obj, db_api)
# NOTE(review): poking private attributes (_settings_obj._link_type) is
# unsupported API surface and may mask the settings misconfiguration above.
linker._settings_obj._link_type = "dedupe_only"

try:
    # Probability that two random records match
    linker.training.estimate_probability_two_random_records_match(
        [combined_rule],
        recall=0.7,
    )
    # Estimate u
    linker.training.estimate_u_using_random_sampling(max_pairs=1e8)
    # EM training
    linker.training.estimate_parameters_using_expectation_maximisation(
        combined_rule,
        fix_m_probabilities=False,
        fix_u_probabilities=False
    )
except Exception as e:
    print("EM Training Exception:", e)

try:
    pairwise_predictions = linker.inference.predict(threshold_match_weight=-5)
    clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(pairwise_predictions, 0.9)
    df_clusters = clusters.as_pandas_dataframe()
    print("\nClustered results:")
    print(df_clusters.head(5))
except Exception as e:
    print("Prediction/Clustering Exception:", e)
here is the output: Data generation complete. Saved to california_los_angeles_data.csv.
PER_FirstName PER_LastName PER_DOB PER_SSN
0 Barbara Smith 1985-01-23 867-23-1140
1 Elizabeth Hernandez 1957-03-02 050-48-7043
PER_StreetAddress PER_Apartment PER_City PER_State \
0 1132 Garcia Spurs Apt. 120 Los Angeles California
1 95175 Stephanie Fields Apt. 438 Los Angeles California
PER_Zip PER_HomePhone PER_Email PER_Sex PER_Age
0 14431 568-301-0149x3773 bbrown@example.net M 43
1 88737 001-354-219-1363x3121 timothy75@example.com M 54
id PER_Ethnicity PER_ReportedRace PER_CreateDate group_id match
0 6842 Hispanic Black 2016-12-04 6842 duplicate
1 3818 Non-Hispanic White 2020-08-01 3818 duplicate
Single-line blocking rule: substr(l.PER_FirstName_clean,1,1) = substr(r.PER_FirstName_clean,1,1) AND substr(l.PER_LastName_clean,1,1) = substr(r.PER_LastName_clean,1,1) AND l.PER_DOB = r.PER_DOB OR l.PER_FirstName_clean = r.PER_FirstName_clean OR l.PER_LastName_clean = r.PER_LastName_clean OR l.PER_DOB = r.PER_DOB and here is the error: Candidate pair counts for combined rule:
{'number_of_comparisons_generated_pre_filter_conditions': 169000000, 'number_of_comparisons_to_be_scored_post_filter_conditions': 736905, 'filter_conditions_identified': 'l.per_dob = r.per_dob OR l.per_firstname_clean = r.per_firstname_clean OR l.per_lastname_clean = r.per_lastname_clean', 'equi_join_conditions_identified': '', 'link_type_join_condition': 'where l."source_dataset" || '--' || l."unique_id" < r."source_dataset" || '--' || r."unique_id"'}
'CREATE TABLE __splink__blocked_id_pairs_1b231c37d AS
WITH
__splink__df_concat_with_tf as (
select ' contains unsupported syntax. Falling back to parsing as a 'Command'.
EM Training Exception: Error executing the following sql for table __splink__df_count_cumulative_blocks(__splink__df_count_cumulative_blocks_c83cf81fa):
CREATE TABLE __splink__df_count_cumulative_blocks_c83cf81fa AS
WITH
__splink__df_concat as (
select "PER_FirstName", "PER_LastName", "PER_DOB", "PER_SSN", "PER_StreetAddress", "PER_Apartment", "PER_City", "PER_State", "PER_Zip", "PER_HomePhone", "PER_Email", "PER_Sex", "PER_Age", "unique_id", "PER_Ethnicity", "PER_ReportedRace", "PER_CreateDate", "group_id", "match", "source_dataset", "PER_FirstName_clean", "PER_LastName_clean"
from __splink__input_table_0
),
splink__blocked_id_pairs as (
select
'0' as match_key,
l."source_dataset" || '--' || l."unique_id" as join_key_l,
r."source_dataset" || '-__-' || r."unique_id" as join_key_r
from __splink__df_concat as l
inner join __splink__df_concat as r
on
(substr(l.PER_FirstName_clean,1,1) = substr(r.PER_FirstName_clean,1,1) AND substr(l.PER_LastName_clean,1,1) = substr(r.PER_LastName_clean,1,1) AND l.PER_DOB = r.PER_DOB OR l.PER_FirstName_clean = r.PER_FirstName_clean OR l.PER_LastName_clean = r.PER_LastName_clean OR l.PER_DOB = r.PER_DOB)
None
)
select
count(*) as row_count,
match_key
from __splink__blocked_id_pairs
group by match_key
order by cast(match_key as int) asc
Error was: Parser Error: syntax error at or near "None"
Prediction/Clustering Exception: Error executing the following sql for table __splink__blocked_id_pairs(__splink__blocked_id_pairs_1b231c37d):
CREATE TABLE __splink__blocked_id_pairs_1b231c37d AS
WITH
__splink__df_concat_with_tf as (
select * from __splink__df_concat_with_tf_3be59fa6c)
select
'0' as match_key,
l."source_dataset" || '-__-' || l."unique_id" as join_key_l,
r."source_dataset" || '-__-' || r."unique_id" as join_key_r
from __splink__df_concat_with_tf as l
inner join __splink__df_concat_with_tf as r
on
(1=1)
None
Error was: Parser Error: syntax error at or near "None"
To Reproduce
Run code.
OS:
Jupyter notebook on vsc.
Splink version:
4.0.7
Have you tried this on the latest master branch?
Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?
What happens?
In the code below, I generate some fake data then use Splink to identify the duplicates. Splink gives me this error: Error was: Parser Error: syntax error at or near "None" Here is the code (followed by the output and error): ############################################################
1) DATA GENERATION AND DUPLICATE INJECTION
############################################################
import pandas as pd
import random
from faker import Faker
fake = Faker()
Faker.seed(0)
random.seed(0)
n = 10000
data = []
for i in range(n):
first_name = fake.first_name()
last_name = fake.last_name()
dob = fake.date_of_birth(minimum_age=18, maximum_age=90).strftime("%Y-%m-%d")
ssn = fake.ssn()
street_address = fake.street_address()
apartment = fake.secondary_address() if random.random() < 0.5 else ""
city = "Los Angeles"
state = "California"
zipcode = fake.zipcode()
home_phone = fake.phone_number()
email = fake.email()
sex = random.choice(["M", "F"])
age = fake.random_int(min=18, max=90)
record_id = i + 1
ethnicity = random.choice(["Hispanic", "Non-Hispanic"])
reported_race = random.choice(["White", "Black", "Asian", "Other"])
create_date = fake.date_between(start_date='-10y', end_date='today').strftime("%Y-%m-%d")
df = pd.DataFrame(data)
num_dups = int(n * 0.3)
dup_indices = random.sample(range(n), num_dups)
df_dup = df.iloc[dup_indices].copy()
df["group_id"] = df["id"]
df_dup["group_id"] = df_dup["id"]
df_full = pd.concat([df, df_dup], ignore_index=True)
group_counts = df_full["group_id"].value_counts()
df_full["match"] = df_full["group_id"].apply(lambda gid: "duplicate" if group_counts[gid] > 1 else "not duplicate")
df_full = df_full.sample(frac=1, random_state=0).reset_index(drop=True)
csv_path = "california_los_angeles_data.csv"
df_full.to_csv(csv_path, index=False)
print("Data generation complete. Saved to california_los_angeles_data.csv.")
print(df_full.head(2))
############################################################
2) SPLINK PIPELINE (SINGLE-LINE RULE, NO EXTRA PARENS)
############################################################
import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on
from splink.blocking_analysis import count_comparisons_from_blocking_rule
df_loaded = pd.read_csv(csv_path)
Rename "id" -> "unique_id" and zero-pad
df_loaded.rename(columns={"id": "unique_id"}, inplace=True)
df_loaded["unique_id"] = df_loaded["unique_id"].astype(str).str.zfill(8)
df_loaded["source_dataset"] = "A"
Clean fields
df_loaded["PER_FirstName_clean"] = df_loaded["PER_FirstName"].str.lower().str.strip().fillna("")
df_loaded["PER_LastName_clean"] = df_loaded["PER_LastName"].str.lower().str.strip().fillna("")
df_loaded["PER_DOB"] = pd.to_datetime(df_loaded["PER_DOB"]).dt.strftime("%Y-%m-%d")
db_api = DuckDBAPI()
Single-line combined rule, no extra parentheses:
- first name initial + last name initial + DOB
OR
- exact first name OR exact last name OR exact DOB
combined_rule = (
"substr(l.PER_FirstName_clean,1,1) = substr(r.PER_FirstName_clean,1,1) "
"AND substr(l.PER_LastName_clean,1,1) = substr(r.PER_LastName_clean,1,1) "
"AND l.PER_DOB = r.PER_DOB "
"OR "
"l.PER_FirstName_clean = r.PER_FirstName_clean "
"OR l.PER_LastName_clean = r.PER_LastName_clean "
"OR l.PER_DOB = r.PER_DOB"
)
print("\nSingle-line blocking rule:", combined_rule)
candidate_counts = count_comparisons_from_blocking_rule(
table_or_tables=df_loaded,
blocking_rule=combined_rule,
link_type="dedupe_only",
db_api=db_api,
unique_id_column_name="unique_id",
source_dataset_column_name="source_dataset"
)
print("Candidate pair counts for combined rule:")
print(candidate_counts)
Build settings
settings_dict = {
"link_type": "dedupe_only",
"unique_id_column_name": "unique_id",
"source_dataset_column_name": "source_dataset",
"column_info_settings": {
"unique_id_input_column": "unique_id",
"source_dataset_input_column": "source_dataset"
},
"comparisons": [
# Fuzzy name, street address, phone
{"comparison_name": "name_comparison", "col_name": "PER_FirstName_clean"},
{"comparison_name": "name_comparison", "col_name": "PER_LastName_clean"},
{"comparison_name": "jaro_winkler", "col_name": "PER_StreetAddress"},
{"comparison_name": "jaro_winkler", "col_name": "PER_HomePhone"},
# Email comparison
{"comparison_name": "email_comparison", "col_name": "PER_Email"},
# Exact for the rest
{"comparison_name": "exact_match", "col_name": "PER_Sex"},
{"comparison_name": "exact_match", "col_name": "PER_Ethnicity"},
{"comparison_name": "exact_match", "col_name": "PER_ReportedRace"}
],
# Only one rule
"blocking_rules_to_generate_predictions": [
combined_rule
]
}
settings_obj = SettingsCreator(settings_dict)
linker = Linker(df_loaded, settings_obj, db_api)
linker._settings_obj._link_type = "dedupe_only"
try:
# Probability that two random records match
linker.training.estimate_probability_two_random_records_match(
[combined_rule],
recall=0.7,
)
# Estimate u
linker.training.estimate_u_using_random_sampling(max_pairs=1e8)
# EM training
linker.training.estimate_parameters_using_expectation_maximisation(
combined_rule,
fix_m_probabilities=False,
fix_u_probabilities=False
)
except Exception as e:
print("EM Training Exception:", e)
try:
pairwise_predictions = linker.inference.predict(threshold_match_weight=-5)
clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(pairwise_predictions, 0.9)
df_clusters = clusters.as_pandas_dataframe()
print("\nClustered results:")
print(df_clusters.head(5))
except Exception as e:
print("Prediction/Clustering Exception:", e)
here is the output: Data generation complete. Saved to california_los_angeles_data.csv.
PER_FirstName PER_LastName PER_DOB PER_SSN
0 Barbara Smith 1985-01-23 867-23-1140
1 Elizabeth Hernandez 1957-03-02 050-48-7043
0 1132 Garcia Spurs Apt. 120 Los Angeles California
1 95175 Stephanie Fields Apt. 438 Los Angeles California
PER_Zip PER_HomePhone PER_Email PER_Sex PER_Age
0 14431 568-301-0149x3773 bbrown@example.net M 43
1 88737 001-354-219-1363x3121 timothy75@example.com M 54
0 6842 Hispanic Black 2016-12-04 6842 duplicate
1 3818 Non-Hispanic White 2020-08-01 3818 duplicate
Single-line blocking rule: substr(l.PER_FirstName_clean,1,1) = substr(r.PER_FirstName_clean,1,1) AND substr(l.PER_LastName_clean,1,1) = substr(r.PER_LastName_clean,1,1) AND l.PER_DOB = r.PER_DOB OR l.PER_FirstName_clean = r.PER_FirstName_clean OR l.PER_LastName_clean = r.PER_LastName_clean OR l.PER_DOB = r.PER_DOB and here is the error: Candidate pair counts for combined rule:
{'number_of_comparisons_generated_pre_filter_conditions': 169000000, 'number_of_comparisons_to_be_scored_post_filter_conditions': 736905, 'filter_conditions_identified': 'l.per_dob = r.per_dob OR l.per_firstname_clean = r.per_firstname_clean OR l.per_lastname_clean = r.per_lastname_clean', 'equi_join_conditions_identified': '', 'link_type_join_condition': 'where l."source_dataset" || '--' || l."unique_id" < r."source_dataset" || '--' || r."unique_id"'}
'CREATE TABLE __splink__blocked_id_pairs_1b231c37d AS
WITH
__splink__df_concat_with_tf as (
select ' contains unsupported syntax. Falling back to parsing as a 'Command'.
EM Training Exception: Error executing the following sql for table
__splink__df_count_cumulative_blocks(__splink__df_count_cumulative_blocks_c83cf81fa):CREATE TABLE __splink__df_count_cumulative_blocks_c83cf81fa AS
WITH
__splink__df_concat as (
select "PER_FirstName", "PER_LastName", "PER_DOB", "PER_SSN", "PER_StreetAddress", "PER_Apartment", "PER_City", "PER_State", "PER_Zip", "PER_HomePhone", "PER_Email", "PER_Sex", "PER_Age", "unique_id", "PER_Ethnicity", "PER_ReportedRace", "PER_CreateDate", "group_id", "match", "source_dataset", "PER_FirstName_clean", "PER_LastName_clean"
splink__blocked_id_pairs as (
select
'0' as match_key,
l."source_dataset" || '--' || l."unique_id" as join_key_l,
r."source_dataset" || '-__-' || r."unique_id" as join_key_r
from __splink__df_concat as l
inner join __splink__df_concat as r
on
(substr(l.PER_FirstName_clean,1,1) = substr(r.PER_FirstName_clean,1,1) AND substr(l.PER_LastName_clean,1,1) = substr(r.PER_LastName_clean,1,1) AND l.PER_DOB = r.PER_DOB OR l.PER_FirstName_clean = r.PER_FirstName_clean OR l.PER_LastName_clean = r.PER_LastName_clean OR l.PER_DOB = r.PER_DOB)
None
Error was: Parser Error: syntax error at or near "None"
Prediction/Clustering Exception: Error executing the following sql for table
__splink__blocked_id_pairs(__splink__blocked_id_pairs_1b231c37d):CREATE TABLE __splink__blocked_id_pairs_1b231c37d AS
WITH
__splink__df_concat_with_tf as (
select * from __splink__df_concat_with_tf_3be59fa6c)
Error was: Parser Error: syntax error at or near "None"
To Reproduce
Run code.
OS:
Jupyter notebook on vsc.
Splink version:
4.0.7
Have you tried this on the latest master branch?
Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?