Skip to content

Commit a883c31

Browse files
committed
Merge branch 'nd/icase'
"git grep -i" has been taught to fold case in non-ascii locales correctly.

* nd/icase:
  - grep.c: reuse "icase" variable
  - diffcore-pickaxe: support case insensitive match on non-ascii
  - diffcore-pickaxe: Add regcomp_or_die()
  - grep/pcre: support utf-8
  - gettext: add is_utf8_locale()
  - grep/pcre: prepare locale-dependent tables for icase matching
  - grep: rewrite an if/else condition to avoid duplicate expression
  - grep/icase: avoid kwsset when -F is specified
  - grep/icase: avoid kwsset on literal non-ascii strings
  - test-regex: expose full regcomp() to the command line
  - test-regex: isolate the bug test code
  - grep: break down an "if" stmt in preparation for next changes
2 parents a63d31b + 695f95b commit a883c31

11 files changed

+291
-21
lines changed

diffcore-pickaxe.c

+24-9
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
#include "diffcore.h"
88
#include "xdiff-interface.h"
99
#include "kwset.h"
10+
#include "commit.h"
11+
#include "quote.h"
1012

1113
typedef int (*pickaxe_fn)(mmfile_t *one, mmfile_t *two,
1214
struct diff_options *o,
@@ -198,6 +200,18 @@ static void pickaxe(struct diff_queue_struct *q, struct diff_options *o,
198200
*q = outq;
199201
}
200202

203+
/*
 * Compile `needle` into `regex` using the regcomp() flags in `cflags`.
 * If compilation fails, the regerror() text is reported through
 * die("invalid regex: ...").
 */
static void regcomp_or_die(regex_t *regex, const char *needle, int cflags)
{
	int err = regcomp(regex, needle, cflags);
	if (err) {
		/* The POSIX.2 people are surely sick */
		char errbuf[1024];
		regerror(err, regex, errbuf, sizeof(errbuf));
		/*
		 * No regfree() here: POSIX leaves the contents of *regex
		 * undefined after a failed regcomp(), so there is nothing
		 * that can safely be released.
		 */
		die("invalid regex: %s", errbuf);
	}
}
214+
201215
void diffcore_pickaxe(struct diff_options *o)
202216
{
203217
const char *needle = o->pickaxe;
@@ -206,18 +220,19 @@ void diffcore_pickaxe(struct diff_options *o)
206220
kwset_t kws = NULL;
207221

208222
if (opts & (DIFF_PICKAXE_REGEX | DIFF_PICKAXE_KIND_G)) {
209-
int err;
210223
int cflags = REG_EXTENDED | REG_NEWLINE;
211224
if (DIFF_OPT_TST(o, PICKAXE_IGNORE_CASE))
212225
cflags |= REG_ICASE;
213-
err = regcomp(&regex, needle, cflags);
214-
if (err) {
215-
/* The POSIX.2 people are surely sick */
216-
char errbuf[1024];
217-
regerror(err, &regex, errbuf, 1024);
218-
regfree(&regex);
219-
die("invalid regex: %s", errbuf);
220-
}
226+
regcomp_or_die(&regex, needle, cflags);
227+
regexp = &regex;
228+
} else if (DIFF_OPT_TST(o, PICKAXE_IGNORE_CASE) &&
229+
has_non_ascii(needle)) {
230+
struct strbuf sb = STRBUF_INIT;
231+
int cflags = REG_NEWLINE | REG_ICASE;
232+
233+
basic_regex_quote_buf(&sb, needle);
234+
regcomp_or_die(&regex, sb.buf, cflags);
235+
strbuf_release(&sb);
221236
regexp = &regex;
222237
} else {
223238
kws = kwsalloc(DIFF_OPT_TST(o, PICKAXE_IGNORE_CASE)

gettext.c

+22-2
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
# endif
1919
#endif
2020

21+
static const char *charset;
22+
2123
/*
2224
* Guess the user's preferred languages from the value in LANGUAGE environment
2325
* variable and LC_MESSAGES locale category if NO_GETTEXT is not defined.
@@ -65,7 +67,6 @@ static int test_vsnprintf(const char *fmt, ...)
6567
return ret;
6668
}
6769

68-
static const char *charset;
6970
static void init_gettext_charset(const char *domain)
7071
{
7172
/*
@@ -172,8 +173,27 @@ int gettext_width(const char *s)
172173
{
173174
static int is_utf8 = -1;
174175
if (is_utf8 == -1)
175-
is_utf8 = !strcmp(charset, "UTF-8");
176+
is_utf8 = is_utf8_locale();
176177

177178
return is_utf8 ? utf8_strwidth(s) : strlen(s);
178179
}
179180
#endif
181+
182+
int is_utf8_locale(void)
183+
{
184+
#ifdef NO_GETTEXT
185+
if (!charset) {
186+
const char *env = getenv("LC_ALL");
187+
if (!env || !*env)
188+
env = getenv("LC_CTYPE");
189+
if (!env || !*env)
190+
env = getenv("LANG");
191+
if (!env)
192+
env = "";
193+
if (strchr(env, '.'))
194+
env = strchr(env, '.') + 1;
195+
charset = xstrdup(env);
196+
}
197+
#endif
198+
return is_encoding_utf8(charset);
199+
}

gettext.h

+1
Original file line numberDiff line numberDiff line change
@@ -90,5 +90,6 @@ const char *Q_(const char *msgid, const char *plu, unsigned long n)
9090
#endif
9191

9292
const char *get_preferred_languages(void);
93+
extern int is_utf8_locale(void);
9394

9495
#endif

grep.c

+57-7
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
#include "xdiff-interface.h"
55
#include "diff.h"
66
#include "diffcore.h"
7+
#include "commit.h"
8+
#include "quote.h"
79

810
static int grep_source_load(struct grep_source *gs);
911
static int grep_source_is_binary(struct grep_source *gs);
@@ -322,11 +324,16 @@ static void compile_pcre_regexp(struct grep_pat *p, const struct grep_opt *opt)
322324
int erroffset;
323325
int options = PCRE_MULTILINE;
324326

325-
if (opt->ignore_case)
327+
if (opt->ignore_case) {
328+
if (has_non_ascii(p->pattern))
329+
p->pcre_tables = pcre_maketables();
326330
options |= PCRE_CASELESS;
331+
}
332+
if (is_utf8_locale() && has_non_ascii(p->pattern))
333+
options |= PCRE_UTF8;
327334

328335
p->pcre_regexp = pcre_compile(p->pattern, options, &error, &erroffset,
329-
NULL);
336+
p->pcre_tables);
330337
if (!p->pcre_regexp)
331338
compile_regexp_failed(p, error);
332339

@@ -360,6 +367,7 @@ static void free_pcre_regexp(struct grep_pat *p)
360367
{
361368
pcre_free(p->pcre_regexp);
362369
pcre_free(p->pcre_extra_info);
370+
pcre_free((void *)p->pcre_tables);
363371
}
364372
#else /* !USE_LIBPCRE */
365373
static void compile_pcre_regexp(struct grep_pat *p, const struct grep_opt *opt)
@@ -396,26 +404,68 @@ static int is_fixed(const char *s, size_t len)
396404
return 1;
397405
}
398406

407+
static void compile_fixed_regexp(struct grep_pat *p, struct grep_opt *opt)
408+
{
409+
struct strbuf sb = STRBUF_INIT;
410+
int err;
411+
int regflags;
412+
413+
basic_regex_quote_buf(&sb, p->pattern);
414+
regflags = opt->regflags & ~REG_EXTENDED;
415+
if (opt->ignore_case)
416+
regflags |= REG_ICASE;
417+
err = regcomp(&p->regexp, sb.buf, regflags);
418+
if (opt->debug)
419+
fprintf(stderr, "fixed %s\n", sb.buf);
420+
strbuf_release(&sb);
421+
if (err) {
422+
char errbuf[1024];
423+
regerror(err, &p->regexp, errbuf, sizeof(errbuf));
424+
regfree(&p->regexp);
425+
compile_regexp_failed(p, errbuf);
426+
}
427+
}
428+
399429
static void compile_regexp(struct grep_pat *p, struct grep_opt *opt)
400430
{
431+
int icase, ascii_only;
401432
int err;
402433

403434
p->word_regexp = opt->word_regexp;
404435
p->ignore_case = opt->ignore_case;
436+
icase = opt->regflags & REG_ICASE || p->ignore_case;
437+
ascii_only = !has_non_ascii(p->pattern);
405438

439+
/*
440+
* Even when -F (fixed) asks us to do a non-regexp search, we
441+
* may not be able to correctly case-fold when -i
442+
* (ignore-case) is asked (in which case, we'll synthesize a
443+
* regexp to match the pattern that matches regexp special
444+
* characters literally, while ignoring case differences). On
445+
* the other hand, even without -F, if the pattern does not
446+
* have any regexp special characters and there is no need for
447+
* case-folding search, we can internally turn it into a
448+
* simple string match using kws. p->fixed tells us if we
449+
* want to use kws.
450+
*/
406451
if (opt->fixed || is_fixed(p->pattern, p->patternlen))
407-
p->fixed = 1;
452+
p->fixed = !icase || ascii_only;
408453
else
409454
p->fixed = 0;
410455

411456
if (p->fixed) {
412-
if (opt->regflags & REG_ICASE || p->ignore_case)
413-
p->kws = kwsalloc(tolower_trans_tbl);
414-
else
415-
p->kws = kwsalloc(NULL);
457+
p->kws = kwsalloc(icase ? tolower_trans_tbl : NULL);
416458
kwsincr(p->kws, p->pattern, p->patternlen);
417459
kwsprep(p->kws);
418460
return;
461+
} else if (opt->fixed) {
462+
/*
463+
* We come here when the pattern has the non-ascii
464+
* characters we cannot case-fold, and asked to
465+
* ignore-case.
466+
*/
467+
compile_fixed_regexp(p, opt);
468+
return;
419469
}
420470

421471
if (opt->pcre) {

grep.h

+1
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ struct grep_pat {
4848
regex_t regexp;
4949
pcre *pcre_regexp;
5050
pcre_extra *pcre_extra_info;
51+
const unsigned char *pcre_tables;
5152
kwset_t kws;
5253
unsigned fixed:1;
5354
unsigned ignore_case:1;

quote.c

+37
Original file line numberDiff line numberDiff line change
@@ -453,3 +453,40 @@ void tcl_quote_buf(struct strbuf *sb, const char *src)
453453
}
454454
strbuf_addch(sb, '"');
455455
}
456+
457+
/*
 * Append to `sb` a copy of `src` quoted so that, compiled as a POSIX
 * basic regular expression, it matches `src` literally.
 *
 * '[', '.', '\\' and '*' are backslash-quoted, as are a '^' that opens
 * the pattern and a '$' that ends it. A '*' that is the very first
 * character is copied unquoted.
 */
void basic_regex_quote_buf(struct strbuf *sb, const char *src)
{
	char c;

	if (*src == '^') {
		/* only beginning '^' is special and needs quoting */
		strbuf_addch(sb, '\\');
		strbuf_addch(sb, *src++);
	} else if (*src == '*') {
		/*
		 * A '*' that opens the whole pattern is not special, no
		 * quoting. This must not apply after the quoted '^' above:
		 * "\^*" would mean "zero or more '^'", so a '*' in that
		 * position is left for the loop below to quote.
		 */
		strbuf_addch(sb, *src++);
	}

	while ((c = *src++)) {
		switch (c) {
		case '[':
		case '.':
		case '\\':
		case '*':
			strbuf_addch(sb, '\\');
			strbuf_addch(sb, c);
			break;

		case '$':
			/* only the end '$' is special and needs quoting */
			if (*src == '\0')
				strbuf_addch(sb, '\\');
			strbuf_addch(sb, c);
			break;

		default:
			strbuf_addch(sb, c);
			break;
		}
	}
}

quote.h

+1
Original file line numberDiff line numberDiff line change
@@ -70,5 +70,6 @@ extern char *quote_path_relative(const char *in, const char *prefix,
7070
extern void perl_quote_buf(struct strbuf *sb, const char *src);
7171
extern void python_quote_buf(struct strbuf *sb, const char *src);
7272
extern void tcl_quote_buf(struct strbuf *sb, const char *src);
73+
extern void basic_regex_quote_buf(struct strbuf *sb, const char *src);
7374

7475
#endif

t/helper/test-regex.c

+57-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,23 @@
11
#include "git-compat-util.h"
2+
#include "gettext.h"
23

3-
int main(int argc, char **argv)
4+
/*
 * Table pairing a textual flag name with its <regex.h> flag bit.
 * The entry whose name is NULL marks the end of the table.
 */
static struct reg_flag {
	const char *name;	/* spelling of the flag, without "REG_" */
	int flag;		/* corresponding REG_* constant */
} reg_flags[] = {
	{ "EXTENDED", REG_EXTENDED },
	{ "NEWLINE", REG_NEWLINE },
	{ "ICASE", REG_ICASE },
	{ "NOTBOL", REG_NOTBOL },
#ifdef REG_STARTEND
	/* not available in every regex implementation */
	{ "STARTEND", REG_STARTEND },
#endif
	{ NULL, 0 }
};
19+
20+
static int test_regex_bug(void)
421
{
522
char *pat = "[^={} \t]+";
623
char *str = "={}\nfred";
@@ -16,5 +33,43 @@ int main(int argc, char **argv)
1633
if (m[0].rm_so == 3) /* matches '\n' when it should not */
1734
die("regex bug confirmed: re-build git with NO_REGEX=1");
1835

19-
exit(0);
36+
return 0;
37+
}
38+
39+
int main(int argc, char **argv)
40+
{
41+
const char *pat;
42+
const char *str;
43+
int flags = 0;
44+
regex_t r;
45+
regmatch_t m[1];
46+
47+
if (argc == 2 && !strcmp(argv[1], "--bug"))
48+
return test_regex_bug();
49+
else if (argc < 3)
50+
usage("test-regex --bug\n"
51+
"test-regex <pattern> <string> [<options>]");
52+
53+
argv++;
54+
pat = *argv++;
55+
str = *argv++;
56+
while (*argv) {
57+
struct reg_flag *rf;
58+
for (rf = reg_flags; rf->name; rf++)
59+
if (!strcmp(*argv, rf->name)) {
60+
flags |= rf->flag;
61+
break;
62+
}
63+
if (!rf->name)
64+
die("do not recognize %s", *argv);
65+
argv++;
66+
}
67+
git_setup_gettext();
68+
69+
if (regcomp(&r, pat, flags))
70+
die("failed regcomp() for pattern '%s'", pat);
71+
if (regexec(&r, str, 1, m, 0))
72+
return 1;
73+
74+
return 0;
2075
}

t/t0070-fundamental.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ test_expect_success 'git_mkstemps_mode does not fail if fd 0 is not open' '
3131

3232
test_expect_success 'check for a bug in the regex routines' '
3333
# if this test fails, re-build git with NO_REGEX=1
34-
test-regex
34+
test-regex --bug
3535
'
3636

3737
test_done

0 commit comments

Comments
 (0)