Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] maintain sorted array for conditional join #1398

Merged
merged 30 commits into dev from samukweku/non-equi-join-improve
Oct 2, 2024
Merged
Changes from 1 commit
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
03d15d9
more control on how the final dataframe is created
Sep 6, 2024
c5a59d6
fix doctests
Sep 7, 2024
f138af0
fix doctest for polars.pivot_longer
Sep 7, 2024
72f6e1a
fix bird_call.ipynb ruff
Sep 7, 2024
8b64a74
fix board_games.ipynb ruff
Sep 7, 2024
7a6e39b
fix complete.ipynb ruff
Sep 7, 2024
114bb37
fix french_trains.ipynb ruff
Sep 7, 2024
7148186
fix medium_franchise.ipynb ruff
Sep 7, 2024
8c4384d
fix teacher_pupil.ipynb
Sep 7, 2024
de3f958
fix scipy2019 slides ruff
Sep 7, 2024
8740209
add support for ragged arrays
Sep 7, 2024
de7958e
fix test failure
Sep 7, 2024
0f3a441
make equi join align with non-equi join numba
Sep 7, 2024
4fea89d
cleanup tests
Sep 7, 2024
814021a
fail early if df or right is empty
Sep 7, 2024
239ee29
fix doctest
Sep 7, 2024
1cbb37f
remove irrelevant comments
Sep 7, 2024
e76f38a
improve perf for equal indices
Sep 8, 2024
d3c5772
add TODO for equi joins
Sep 8, 2024
1fbd928
keep faster equi-join
Sep 8, 2024
f83546d
merge dev
Sep 17, 2024
57317b0
abstract sorted_array operation to separate function
Sep 17, 2024
bfea362
Merge branch 'dev' into samukweku/non-equi-join-improve
samukweku Sep 17, 2024
cc66145
minor refactor
Sep 18, 2024
0eaef6d
Merge dev into samukweku/non-equi-join-improve
ericmjl Sep 23, 2024
ce68d4e
Merge dev into samukweku/non-equi-join-improve
ericmjl Sep 23, 2024
3510269
Merge dev into samukweku/non-equi-join-improve
ericmjl Sep 28, 2024
a94f13b
Merge dev into samukweku/non-equi-join-improve
ericmjl Oct 1, 2024
3495271
switch to miniforge
Oct 1, 2024
c79e330
update miniconda setup
Oct 1, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fix doctest
samuel.oranyeli committed Sep 7, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
commit 239ee294ab88a79e3a1d7f8f71c6e5e67f42a606
142 changes: 68 additions & 74 deletions janitor/functions/conditional_join.py
Original file line number Diff line number Diff line change
@@ -146,11 +146,11 @@ def conditional_join(
value_1 value_2B
0 2 3.0
1 5 6.0
2 7 NaN
3 1 NaN
4 3 4.0
5 4 5.0
6 4 6.0
2 3 4.0
3 4 5.0
4 4 6.0
5 7 NaN
6 1 NaN

Rename columns, before the join:
>>> (df1
@@ -163,13 +163,13 @@ def conditional_join(
... how='outer')
... )
left_column value_2B
0 7.0 NaN
1 1.0 NaN
2 2.0 3.0
3 5.0 6.0
4 3.0 4.0
5 4.0 5.0
6 4.0 6.0
0 2.0 3.0
1 5.0 6.0
2 3.0 4.0
3 4.0 5.0
4 4.0 6.0
5 7.0 NaN
6 1.0 NaN
7 NaN 1.0
8 NaN 9.0
9 NaN 15.0
@@ -209,18 +209,18 @@ def conditional_join(
... how='outer',
... indicator=True
... )
value_1 _merge value_2A value_2B
0 7.0 left_only NaN NaN
1 1.0 left_only NaN NaN
2 2.0 both 1.0 3.0
3 5.0 both 3.0 6.0
4 3.0 both 2.0 4.0
5 4.0 both 3.0 5.0
6 4.0 both 3.0 6.0
7 NaN right_only 0.0 1.0
8 NaN right_only 7.0 9.0
9 NaN right_only 12.0 15.0
10 NaN right_only 0.0 1.0
value_1 value_2A value_2B _merge
0 2.0 1.0 3.0 both
1 5.0 3.0 6.0 both
2 3.0 2.0 4.0 both
3 4.0 3.0 5.0 both
4 4.0 3.0 6.0 both
5 7.0 NaN NaN left_only
6 1.0 NaN NaN left_only
7 NaN 0.0 1.0 right_only
8 NaN 7.0 9.0 right_only
9 NaN 12.0 15.0 right_only
10 NaN 0.0 1.0 right_only

!!! abstract "Version Changed"

@@ -1226,20 +1226,20 @@ def _inner(
Returns:
An inner joined DataFrame.
"""
frame = {key: value._values[left_index] for key, value in df.items()}
r_frame = {
key: value._values[right_index] for key, value in right.items()
}
frame.update(r_frame)
dictionary = {}
for key, value in df.items():
dictionary[key] = value._values[left_index]
for key, value in right.items():
dictionary[key] = value._values[right_index]
if indicator:
indicator, arr = _add_indicator(
indicator=indicator,
how="inner",
column_length=left_index.size,
columns=df.columns.union(right.columns),
)
frame[indicator] = arr
return pd.DataFrame(frame, copy=False)
dictionary[indicator] = arr
return pd.DataFrame(dictionary, copy=False)

if how == "inner":
return _inner(
@@ -1262,16 +1262,21 @@ def _inner(
right_index=right_index,
indicator=indicator,
)

right_dict = {}
dictionary = {}
for key, value in df.items():
array = value._values
top = array[left_index]
bottom = array[indexer]
value = concat_compat([top, bottom])
dictionary[key] = value
for key, value in right.items():
array = value._values
value = array[right_index]
other = construct_1d_array_from_inferred_fill_value(
value=array[:1], length=length
)
value = concat_compat([value, other])
right_dict[key] = value
dictionary[key] = value
if indicator:
columns = df.columns.union(right.columns)
name, arr1 = _add_indicator(
@@ -1287,16 +1292,8 @@ def _inner(
columns=columns,
)
value = concat_compat([arr1, arr2])
right_dict[name] = value
left_dict = {}
for key, value in df.items():
array = value._values
top = array[left_index]
bottom = array[indexer]
value = concat_compat([top, bottom])
left_dict[key] = value
left_dict.update(right_dict)
return pd.DataFrame(left_dict, copy=False)
dictionary[name] = value
return pd.DataFrame(dictionary, copy=False)

if how == "right":
indexer = pd.unique(right_index)
@@ -1311,22 +1308,21 @@ def _inner(
right_index=right_index,
indicator=indicator,
)
left_dict = {}
dictionary = {}
for key, value in df.items():
array = value._values
value = array[left_index]
other = construct_1d_array_from_inferred_fill_value(
value=array[:1], length=length
)
value = concat_compat([value, other])
left_dict[key] = value
right_dict = {}
dictionary[key] = value
for key, value in right.items():
array = value._values
top = array[right_index]
bottom = array[indexer]
value = concat_compat([top, bottom])
right_dict[key] = value
dictionary[key] = value
if indicator:
columns = df.columns.union(right.columns)
name, arr1 = _add_indicator(
@@ -1342,9 +1338,8 @@ def _inner(
columns=columns,
)
value = concat_compat([arr1, arr2])
right_dict[name] = value
left_dict.update(right_dict)
return pd.DataFrame(left_dict, copy=False)
dictionary[name] = value
return pd.DataFrame(dictionary, copy=False)
# how == 'outer'
left_indexer = pd.unique(left_index)
left_indexer = pd.Index(left_indexer).get_indexer(range(len(df)))
@@ -1355,7 +1350,24 @@ def _inner(

df_nulls_length = left_indexer.size
right_nulls_length = right_indexer.size
right_dict = {}
dictionary = {}
for key, value in df.items():
array = value._values
top = array[left_index]
top = [top]
if df_nulls_length:
middle = array[left_indexer]
top.append(middle)
if right_nulls_length:
bottom = construct_1d_array_from_inferred_fill_value(
value=array[:1], length=right_nulls_length
)
top.append(bottom)
if len(top) == 1:
top = top[0]
else:
top = concat_compat(top)
dictionary[key] = top
for key, value in right.items():
array = value._values
top = array[right_index]
@@ -1372,7 +1384,7 @@ def _inner(
top = top[0]
else:
top = concat_compat(top)
right_dict[key] = top
dictionary[key] = top
if indicator:
columns = df.columns.union(right.columns)
name, arr1 = _add_indicator(
@@ -1402,27 +1414,9 @@ def _inner(
arr1 = arr1[0]
else:
arr1 = concat_compat(arr1)
right_dict[name] = arr1
left_dict = {}
for key, value in df.items():
array = value._values
top = array[left_index]
top = [top]
if df_nulls_length:
middle = array[left_indexer]
top.append(middle)
if right_nulls_length:
bottom = construct_1d_array_from_inferred_fill_value(
value=array[:1], length=right_nulls_length
)
top.append(bottom)
if len(top) == 1:
top = top[0]
else:
top = concat_compat(top)
right_dict[key] = top
left_dict.update(right_dict)
return pd.DataFrame(left_dict, copy=False)
dictionary[name] = arr1

return pd.DataFrame(dictionary, copy=False)


def get_join_indices(