Skip to content

Commit 9812d84

Browse files
Bidek56dchrostowski
and
dchrostowski
authored
Allowing expr in contains (pola-rs#312)
Allowing expr in contains to match Python syntax to close pola-rs#311 Co-authored-by: dchrostowski <[email protected]>
1 parent bcff890 commit 9812d84

File tree

5 files changed

+135
-27
lines changed

5 files changed

+135
-27
lines changed

__tests__/expr.test.ts

+26-2
Original file line numberDiff line numberDiff line change
@@ -1034,7 +1034,19 @@ describe("expr.str", () => {
10341034
expect(actual).toFrameEqual(expected);
10351035
expect(seriesActual).toSeriesEqual(expected.getColumn("isLinux"));
10361036
});
1037-
1037+
test("contains:expr", () => {
1038+
const df = pl.DataFrame({
1039+
os: ["linux-kali", "linux-debian", "windows-vista"],
1040+
name: ["kali", "debian", "macos"],
1041+
});
1042+
const expected = df.withColumn(
1043+
pl.Series("isLinux", [true, true, false], pl.Bool),
1044+
);
1045+
const actual = df.withColumn(
1046+
col("os").str.contains(pl.col("name")).as("isLinux"),
1047+
);
1048+
expect(actual).toFrameEqual(expected);
1049+
});
10381050
test("contains:regex", () => {
10391051
const df = pl.DataFrame({
10401052
a: ["Foo", "foo", "FoO"],
@@ -1050,7 +1062,19 @@ describe("expr.str", () => {
10501062
expect(actual).toFrameEqual(expected);
10511063
expect(seriesActual).toSeriesEqual(expected.getColumn("contains"));
10521064
});
1053-
1065+
test("contains:regex2", () => {
1066+
const df = pl.DataFrame({ txt: ["Crab", "cat and dog", "rab$bit", null] });
1067+
const actual = df.select(
1068+
pl.col("txt"),
1069+
pl.col("txt").str.contains("cat|bit").alias("regex"),
1070+
pl.col("txt").str.contains("rab$", true).alias("literal"),
1071+
);
1072+
const expected = df.withColumns(
1073+
pl.Series("regex", [false, true, true, null], pl.Bool),
1074+
pl.Series("literal", [false, false, true, null], pl.Bool),
1075+
);
1076+
expect(actual).toFrameEqual(expected);
1077+
});
10541078
test("split", () => {
10551079
const df = pl.DataFrame({ a: ["ab,cd", "e,fg", "h"] });
10561080
const expected = pl.DataFrame({

polars/lazy/expr/string.ts

+34-4
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,38 @@ export interface StringNamespace extends StringFunctions<Expr> {
2626
* ```
2727
*/
2828
concat(delimiter: string, ignoreNulls?: boolean): Expr;
29-
/** Check if strings in Series contain regex pattern. */
30-
contains(pat: string | RegExp): Expr;
29+
/**
30+
* Check if strings in Series contain a substring that matches a pattern.
31+
* @param pat A valid regular expression pattern, compatible with the `regex` crate.
32+
* @param literal Treat `pat` as a literal string, not as a regular expression.
33+
* @param strict Raise an error if the underlying pattern is not a valid regex, otherwise mask out with a null value.
34+
* @returns Boolean mask
35+
* @example
36+
* ```
37+
* const df = pl.DataFrame({"txt": ["Crab", "cat and dog", "rab$bit", null]})
38+
* df.select(
39+
* ... pl.col("txt"),
40+
* ... pl.col("txt").str.contains("cat|bit").alias("regex"),
41+
* ... pl.col("txt").str.contains("rab$", true).alias("literal"),
42+
* ... )
43+
* shape: (4, 3)
44+
* ┌─────────────┬───────┬─────────┐
45+
* │ txt ┆ regex ┆ literal │
46+
* │ --- ┆ --- ┆ --- │
47+
* │ str ┆ bool ┆ bool │
48+
* ╞═════════════╪═══════╪═════════╡
49+
* │ Crab ┆ false ┆ false │
50+
* │ cat and dog ┆ true ┆ false │
51+
* │ rab$bit ┆ true ┆ true │
52+
* │ null ┆ null ┆ null │
53+
* └─────────────┴───────┴─────────┘
54+
* ```
55+
*/
56+
contains(
57+
pat: string | RegExp | Expr,
58+
literal?: boolean,
59+
strict?: boolean,
60+
): Expr;
3161
/**
3262
* Decodes a value using the provided encoding
3363
* @param encoding - hex | base64
@@ -321,8 +351,8 @@ export const ExprStringFunctions = (_expr: any): StringNamespace => {
321351
concat(delimiter: string, ignoreNulls = true) {
322352
return wrap("strConcat", delimiter, ignoreNulls);
323353
},
324-
contains(pat: string | RegExp) {
325-
return wrap("strContains", regexToString(pat), false);
354+
contains(pat: string | Expr, literal = false, strict = true) {
355+
return wrap("strContains", exprToLitOrExpr(pat)._expr, literal, strict);
326356
},
327357
decode(arg, strict = false) {
328358
if (typeof arg === "string") {

polars/series/string.ts

+32-6
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import type { DataType } from "../datatypes";
33
import { col } from "../lazy/functions";
44
import type { StringFunctions } from "../shared_traits";
55
import { regexToString } from "../utils";
6-
import type { Expr } from "./../lazy/expr/index";
6+
import { type Expr, exprToLitOrExpr } from "./../lazy/expr/index";
77

88
/**
99
* namespace containing series string functions
@@ -19,11 +19,37 @@ export interface StringNamespace extends StringFunctions<Series> {
1919
*/
2020
concat(delimiter: string, ignoreNulls?: boolean): Series;
2121
/**
22-
* Check if strings in Series contain regex pattern.
23-
* @param pattern A valid regex pattern
22+
* Check if strings in Series contain a substring that matches a pattern.
23+
* @param pat A valid regular expression pattern, compatible with the `regex` crate.
24+
* @param literal Treat `pat` as a literal string, not as a regular expression.
25+
* @param strict Raise an error if the underlying pattern is not a valid regex, otherwise mask out with a null value.
2426
* @returns Boolean mask
27+
* @example
28+
* ```
29+
* const df = pl.DataFrame({"txt": ["Crab", "cat and dog", "rab$bit", null]})
30+
* df.select(
31+
* ... pl.col("txt"),
32+
* ... pl.col("txt").str.contains("cat|bit").alias("regex"),
33+
* ... pl.col("txt").str.contains("rab$", true).alias("literal"),
34+
* ... )
35+
* shape: (4, 3)
36+
* ┌─────────────┬───────┬─────────┐
37+
* │ txt ┆ regex ┆ literal │
38+
* │ --- ┆ --- ┆ --- │
39+
* │ str ┆ bool ┆ bool │
40+
* ╞═════════════╪═══════╪═════════╡
41+
* │ Crab ┆ false ┆ false │
42+
* │ cat and dog ┆ true ┆ false │
43+
* │ rab$bit ┆ true ┆ true │
44+
* │ null ┆ null ┆ null │
45+
* └─────────────┴───────┴─────────┘
46+
* ```
2547
*/
26-
contains(pattern: string | RegExp): Series;
48+
contains(
49+
pat: string | RegExp | Expr,
50+
literal?: boolean,
51+
strict?: boolean,
52+
): Series;
2753
/**
2854
* Decodes a value using the provided encoding
2955
* @param encoding - hex | base64
@@ -279,8 +305,8 @@ export const SeriesStringFunctions = (_s: any): StringNamespace => {
279305
.select(col(_s.name).str.concat(delimiter, ignoreNulls).as(_s.name))
280306
.getColumn(_s.name);
281307
},
282-
contains(pat: string | RegExp) {
283-
return wrap("strContains", regexToString(pat), false);
308+
contains(pat: string | RegExp | Expr, literal = false, strict = true) {
309+
return wrap("strContains", regexToString(pat as RegExp), literal, strict);
284310
},
285311
decode(arg, strict = false) {
286312
if (typeof arg === "string") {

polars/shared_traits.ts

+28-2
Original file line numberDiff line numberDiff line change
@@ -850,8 +850,34 @@ export interface StringFunctions<T> {
850850
* ```
851851
*/
852852
concat(delimiter: string, ignoreNulls?: boolean): T;
853-
/** Check if strings in Series contain regex pattern. */
854-
contains(pat: string | RegExp): T;
853+
/**
854+
* Check if strings in Series contain a substring that matches a pattern.
855+
* @param pat A valid regular expression pattern, compatible with the `regex` crate.
856+
* @param literal Treat `pat` as a literal string, not as a regular expression.
857+
* @param strict Raise an error if the underlying pattern is not a valid regex, otherwise mask out with a null value.
858+
* @returns Boolean mask
859+
* @example
860+
* ```
861+
* const df = pl.DataFrame({"txt": ["Crab", "cat and dog", "rab$bit", null]})
862+
* df.select(
863+
* ... pl.col("txt"),
864+
* ... pl.col("txt").str.contains("cat|bit").alias("regex"),
865+
* ... pl.col("txt").str.contains("rab$", true).alias("literal"),
866+
* ... )
867+
* shape: (4, 3)
868+
* ┌─────────────┬───────┬─────────┐
869+
* │ txt ┆ regex ┆ literal │
870+
* │ --- ┆ --- ┆ --- │
871+
* │ str ┆ bool ┆ bool │
872+
* ╞═════════════╪═══════╪═════════╡
873+
* │ Crab ┆ false ┆ false │
874+
* │ cat and dog ┆ true ┆ false │
875+
* │ rab$bit ┆ true ┆ true │
876+
* │ null ┆ null ┆ null │
877+
* └─────────────┴───────┴─────────┘
878+
* ```
879+
*/
880+
contains(pat: string | RegExp | Expr, literal: boolean, strict: boolean): T;
855881
/**
856882
* Decodes a value using the provided encoding
857883
* @param encoding - hex | base64

src/lazy/dsl.rs

+15-13
Original file line numberDiff line numberDiff line change
@@ -834,19 +834,21 @@ impl JsExpr {
834834
}
835835

836836
#[napi(catch_unwind)]
837-
pub fn str_contains(&self, pat: String, strict: bool) -> JsExpr {
838-
let function = move |s: Column| {
839-
let ca = s.str()?;
840-
match ca.contains(&pat, strict) {
841-
Ok(ca) => Ok(Some(ca.into_column())),
842-
Err(e) => Err(PolarsError::ComputeError(format!("{:?}", e).into())),
843-
}
844-
};
845-
self.clone()
846-
.inner
847-
.map(function, GetOutput::from_type(DataType::Boolean))
848-
.with_fmt("str.contains")
849-
.into()
837+
pub fn str_contains(&self, pat: &JsExpr, literal: bool, strict: bool) -> JsExpr {
838+
match literal {
839+
true => self
840+
.inner
841+
.clone()
842+
.str()
843+
.contains_literal(pat.inner.clone())
844+
.into(),
845+
_ => self
846+
.inner
847+
.clone()
848+
.str()
849+
.contains(pat.inner.clone(), strict)
850+
.into(),
851+
}
850852
}
851853
#[napi(catch_unwind)]
852854
pub fn str_hex_encode(&self) -> JsExpr {

0 commit comments

Comments
 (0)