Error was: Parser Error: syntax error at or near ")" #2647

@theimanph

Description

What happens?

In the code below, I generate some fake data and then use Splink to identify the duplicates. Splink gives me this error:

Error was: Parser Error: syntax error at or near "None"

Here is the code (followed by the output and error):

############################################################
# 1) DATA GENERATION AND DUPLICATE INJECTION
############################################################

import pandas as pd
import random
from faker import Faker

fake = Faker()
Faker.seed(0)
random.seed(0)

n = 10000
data = []
for i in range(n):
    first_name = fake.first_name()
    last_name = fake.last_name()
    dob = fake.date_of_birth(minimum_age=18, maximum_age=90).strftime("%Y-%m-%d")
    ssn = fake.ssn()
    street_address = fake.street_address()
    apartment = fake.secondary_address() if random.random() < 0.5 else ""
    city = "Los Angeles"
    state = "California"
    zipcode = fake.zipcode()
    home_phone = fake.phone_number()
    email = fake.email()
    sex = random.choice(["M", "F"])
    age = fake.random_int(min=18, max=90)
    record_id = i + 1
    ethnicity = random.choice(["Hispanic", "Non-Hispanic"])
    reported_race = random.choice(["White", "Black", "Asian", "Other"])
    create_date = fake.date_between(start_date='-10y', end_date='today').strftime("%Y-%m-%d")

    data.append({
        "PER_FirstName": first_name,
        "PER_LastName": last_name,
        "PER_DOB": dob,
        "PER_SSN": ssn,
        "PER_StreetAddress": street_address,
        "PER_Apartment": apartment,
        "PER_City": city,
        "PER_State": state,
        "PER_Zip": zipcode,
        "PER_HomePhone": home_phone,
        "PER_Email": email,
        "PER_Sex": sex,
        "PER_Age": age,
        "id": record_id,
        "PER_Ethnicity": ethnicity,
        "PER_ReportedRace": reported_race,
        "PER_CreateDate": create_date
    })

df = pd.DataFrame(data)

num_dups = int(n * 0.3)
dup_indices = random.sample(range(n), num_dups)
df_dup = df.iloc[dup_indices].copy()

df["group_id"] = df["id"]
df_dup["group_id"] = df_dup["id"]

df_full = pd.concat([df, df_dup], ignore_index=True)

group_counts = df_full["group_id"].value_counts()
df_full["match"] = df_full["group_id"].apply(lambda gid: "duplicate" if group_counts[gid] > 1 else "not duplicate")

df_full = df_full.sample(frac=1, random_state=0).reset_index(drop=True)

csv_path = "california_los_angeles_data.csv"
df_full.to_csv(csv_path, index=False)

print("Data generation complete. Saved to california_los_angeles_data.csv.")
print(df_full.head(2))

############################################################
# 2) SPLINK PIPELINE (SINGLE-LINE RULE, NO EXTRA PARENS)
############################################################

import splink.comparison_library as cl
from splink import DuckDBAPI, Linker, SettingsCreator, block_on
from splink.blocking_analysis import count_comparisons_from_blocking_rule

df_loaded = pd.read_csv(csv_path)

Rename "id" -> "unique_id" and zero-pad

df_loaded.rename(columns={"id": "unique_id"}, inplace=True)
df_loaded["unique_id"] = df_loaded["unique_id"].astype(str).str.zfill(8)
df_loaded["source_dataset"] = "A"

# Clean fields

df_loaded["PER_FirstName_clean"] = df_loaded["PER_FirstName"].str.lower().str.strip().fillna("")
df_loaded["PER_LastName_clean"] = df_loaded["PER_LastName"].str.lower().str.strip().fillna("")
df_loaded["PER_DOB"] = pd.to_datetime(df_loaded["PER_DOB"]).dt.strftime("%Y-%m-%d")

db_api = DuckDBAPI()

# Single-line combined rule, no extra parentheses:
#   - first name initial + last name initial + DOB
#     OR
#   - exact first name OR exact last name OR exact DOB

combined_rule = (
    "substr(l.PER_FirstName_clean,1,1) = substr(r.PER_FirstName_clean,1,1) "
    "AND substr(l.PER_LastName_clean,1,1) = substr(r.PER_LastName_clean,1,1) "
    "AND l.PER_DOB = r.PER_DOB "
    "OR "
    "l.PER_FirstName_clean = r.PER_FirstName_clean "
    "OR l.PER_LastName_clean = r.PER_LastName_clean "
    "OR l.PER_DOB = r.PER_DOB"
)
print("\nSingle-line blocking rule:", combined_rule)

candidate_counts = count_comparisons_from_blocking_rule(
    table_or_tables=df_loaded,
    blocking_rule=combined_rule,
    link_type="dedupe_only",
    db_api=db_api,
    unique_id_column_name="unique_id",
    source_dataset_column_name="source_dataset"
)
print("Candidate pair counts for combined rule:")
print(candidate_counts)

# Build settings

settings_dict = {
    "link_type": "dedupe_only",
    "unique_id_column_name": "unique_id",
    "source_dataset_column_name": "source_dataset",
    "column_info_settings": {
        "unique_id_input_column": "unique_id",
        "source_dataset_input_column": "source_dataset"
    },
    "comparisons": [
        # Fuzzy name, street address, phone
        {"comparison_name": "name_comparison", "col_name": "PER_FirstName_clean"},
        {"comparison_name": "name_comparison", "col_name": "PER_LastName_clean"},
        {"comparison_name": "jaro_winkler", "col_name": "PER_StreetAddress"},
        {"comparison_name": "jaro_winkler", "col_name": "PER_HomePhone"},
        # Email comparison
        {"comparison_name": "email_comparison", "col_name": "PER_Email"},
        # Exact for the rest
        {"comparison_name": "exact_match", "col_name": "PER_Sex"},
        {"comparison_name": "exact_match", "col_name": "PER_Ethnicity"},
        {"comparison_name": "exact_match", "col_name": "PER_ReportedRace"}
    ],
    # Only one rule
    "blocking_rules_to_generate_predictions": [
        combined_rule
    ]
}
settings_obj = SettingsCreator(settings_dict)
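For reference, the same comparisons written with the comparison library imported at the top would look roughly like this. This is only a sketch based on my understanding of the Splink 4 comparison_library class names (treat them as assumptions); it is not what the run in this report uses:

# Sketch only; class names assumed from the Splink 4 comparison_library, not verified in this run
comparisons_sketch = [
    cl.NameComparison("PER_FirstName_clean"),
    cl.NameComparison("PER_LastName_clean"),
    cl.JaroWinklerAtThresholds("PER_StreetAddress"),
    cl.JaroWinklerAtThresholds("PER_HomePhone"),
    cl.EmailComparison("PER_Email"),
    cl.ExactMatch("PER_Sex"),
    cl.ExactMatch("PER_Ethnicity"),
    cl.ExactMatch("PER_ReportedRace"),
]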

linker = Linker(df_loaded, settings_obj, db_api)
linker._settings_obj._link_type = "dedupe_only"

try:
    # Probability that two random records match
    linker.training.estimate_probability_two_random_records_match(
        [combined_rule],
        recall=0.7,
    )
    # Estimate u
    linker.training.estimate_u_using_random_sampling(max_pairs=1e8)
    # EM training
    linker.training.estimate_parameters_using_expectation_maximisation(
        combined_rule,
        fix_m_probabilities=False,
        fix_u_probabilities=False
    )
except Exception as e:
    print("EM Training Exception:", e)

try:
    pairwise_predictions = linker.inference.predict(threshold_match_weight=-5)
    clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(pairwise_predictions, 0.9)
    df_clusters = clusters.as_pandas_dataframe()
    print("\nClustered results:")
    print(df_clusters.head(5))
except Exception as e:
    print("Prediction/Clustering Exception:", e)

Here is the output:

Data generation complete. Saved to california_los_angeles_data.csv.
PER_FirstName PER_LastName PER_DOB PER_SSN
0 Barbara Smith 1985-01-23 867-23-1140
1 Elizabeth Hernandez 1957-03-02 050-48-7043

             PER_StreetAddress PER_Apartment     PER_City   PER_State  \

0 1132 Garcia Spurs Apt. 120 Los Angeles California
1 95175 Stephanie Fields Apt. 438 Los Angeles California

PER_Zip PER_HomePhone PER_Email PER_Sex PER_Age
0 14431 568-301-0149x3773 bbrown@example.net M 43
1 88737 001-354-219-1363x3121 timothy75@example.com M 54

 id PER_Ethnicity PER_ReportedRace PER_CreateDate  group_id      match  

0 6842 Hispanic Black 2016-12-04 6842 duplicate
1 3818 Non-Hispanic White 2020-08-01 3818 duplicate

Single-line blocking rule: substr(l.PER_FirstName_clean,1,1) = substr(r.PER_FirstName_clean,1,1) AND substr(l.PER_LastName_clean,1,1) = substr(r.PER_LastName_clean,1,1) AND l.PER_DOB = r.PER_DOB OR l.PER_FirstName_clean = r.PER_FirstName_clean OR l.PER_LastName_clean = r.PER_LastName_clean OR l.PER_DOB = r.PER_DOB

And here is the error:

Candidate pair counts for combined rule:
{'number_of_comparisons_generated_pre_filter_conditions': 169000000, 'number_of_comparisons_to_be_scored_post_filter_conditions': 736905, 'filter_conditions_identified': 'l.per_dob = r.per_dob OR l.per_firstname_clean = r.per_firstname_clean OR l.per_lastname_clean = r.per_lastname_clean', 'equi_join_conditions_identified': '', 'link_type_join_condition': 'where l."source_dataset" || '-__-' || l."unique_id" < r."source_dataset" || '-__-' || r."unique_id"'}
'CREATE TABLE __splink__blocked_id_pairs_1b231c37d AS
WITH

__splink__df_concat_with_tf as (
select ' contains unsupported syntax. Falling back to parsing as a 'Command'.
EM Training Exception: Error executing the following sql for table __splink__df_count_cumulative_blocks(__splink__df_count_cumulative_blocks_c83cf81fa):
CREATE TABLE __splink__df_count_cumulative_blocks_c83cf81fa AS
WITH

__splink__df_concat as (
select "PER_FirstName", "PER_LastName", "PER_DOB", "PER_SSN", "PER_StreetAddress", "PER_Apartment", "PER_City", "PER_State", "PER_Zip", "PER_HomePhone", "PER_Email", "PER_Sex", "PER_Age", "unique_id", "PER_Ethnicity", "PER_ReportedRace", "PER_CreateDate", "group_id", "match", "source_dataset", "PER_FirstName_clean", "PER_LastName_clean"

        from __splink__input_table_0
        ), 

__splink__blocked_id_pairs as (
select
'0' as match_key,
l."source_dataset" || '-
-' || l."unique_id" as join_key_l,
r."source_dataset" || '-__-' || r."unique_id" as join_key_r
from __splink__df_concat as l
inner join __splink__df_concat as r
on
(substr(l.PER_FirstName_clean,1,1) = substr(r.PER_FirstName_clean,1,1) AND substr(l.PER_LastName_clean,1,1) = substr(r.PER_LastName_clean,1,1) AND l.PER_DOB = r.PER_DOB OR l.PER_FirstName_clean = r.PER_FirstName_clean OR l.PER_LastName_clean = r.PER_LastName_clean OR l.PER_DOB = r.PER_DOB)
None

        ) 

    select
    count(*) as row_count,
    match_key
    from __splink__blocked_id_pairs
    group by match_key
    order by cast(match_key as int) asc

Error was: Parser Error: syntax error at or near "None"
Prediction/Clustering Exception: Error executing the following sql for table __splink__blocked_id_pairs(__splink__blocked_id_pairs_1b231c37d):
CREATE TABLE __splink__blocked_id_pairs_1b231c37d AS
WITH

__splink__df_concat_with_tf as (
select * from __splink__df_concat_with_tf_3be59fa6c)

        select
        '0' as match_key,
        l."source_dataset" || '-__-' || l."unique_id" as join_key_l,
        r."source_dataset" || '-__-' || r."unique_id" as join_key_r
        from __splink__df_concat_with_tf as l
        inner join __splink__df_concat_with_tf as r
        on
        (1=1)
        None

Error was: Parser Error: syntax error at or near "None"

To Reproduce

Run the code above.

OS:

Jupyter notebook in VS Code.

Splink version:

4.0.7

Have you tried this on the latest master branch?

  • I agree

Have you tried the steps to reproduce? Do they include all relevant data and configuration? Does the issue you report still appear there?

  • I agree
