mirror of
https://github.com/mdbtools/mdbtools.git
synced 2025-06-28 15:39:02 +08:00
Postgres-style ILIKE operator (with Unicode support) (#244)
Access's `LIKE` is actually case-insensitive, but to avoid breaking existing programs that rely on mdbtools' case-sensitive behavior, this commit introduces a new `ILIKE` operator that performs a case-insensitive match. It uses GLib's `g_utf8_casefold` to make the comparison UTF-8 aware. A "poor man's" version is implemented in fakeglib; it relies on `towlower`, which maps one character to exactly one character, so it won't handle case transformations that expand a single character into several (e.g. German Eszett, which case-folds to "ss"). Fixes #233
This commit is contained in:
parent
1b147b8d29
commit
a44a8ed8ae
@ -50,7 +50,7 @@ SQL LANGUAGE
|
|||||||
|
|
||||||
limit clause: LIMIT <integer>
|
limit clause: LIMIT <integer>
|
||||||
|
|
||||||
operator: =, =>, =<, <>, like, <, >
|
operator: =, =>, =<, <>, like, ilike, <, >
|
||||||
|
|
||||||
literal: integers, floating point numbers, or string literal in single quotes
|
literal: integers, floating point numbers, or string literal in single quotes
|
||||||
|
|
||||||
@ -63,6 +63,10 @@ NOTES
|
|||||||
|
|
||||||
The -i command can be passed the string 'stdin' to test entering text as if using a pipe.
|
The -i command can be passed the string 'stdin' to test entering text as if using a pipe.
|
||||||
|
|
||||||
|
The 'like' operator performs a case-sensitive pattern match, with ANSI-style wildcards. An underscore in the pattern will match any single character, and a percent sign will match any run of characters.
|
||||||
|
|
||||||
|
The 'ilike' operator is similar, but performs a case-insensitive pattern match.
|
||||||
|
|
||||||
ENVIRONMENT
|
ENVIRONMENT
|
||||||
LC_COLLATE Defines the locale for string-comparison operations. See locale(1).
|
LC_COLLATE Defines the locale for string-comparison operations. See locale(1).
|
||||||
MDB_JET3_CHARSET Defines the charset of the input JET3 (access 97) file. Default is CP1252. See iconv(1).
|
MDB_JET3_CHARSET Defines the charset of the input JET3 (access 97) file. Default is CP1252. See iconv(1).
|
||||||
|
@ -144,6 +144,7 @@ void g_printerr(const gchar *format, ...);
|
|||||||
gint g_unichar_to_utf8(gunichar c, gchar *dst);
|
gint g_unichar_to_utf8(gunichar c, gchar *dst);
|
||||||
gchar *g_locale_to_utf8(const gchar *opsysstring, size_t len,
|
gchar *g_locale_to_utf8(const gchar *opsysstring, size_t len,
|
||||||
size_t *bytes_read, size_t *bytes_written, GError **error);
|
size_t *bytes_read, size_t *bytes_written, GError **error);
|
||||||
|
gchar *g_utf8_casefold(const gchar *str, gssize len);
|
||||||
gchar *g_utf8_strdown(const gchar *str, gssize len);
|
gchar *g_utf8_strdown(const gchar *str, gssize len);
|
||||||
|
|
||||||
/* GString */
|
/* GString */
|
||||||
|
@ -129,7 +129,8 @@ enum {
|
|||||||
MDB_LTEQ,
|
MDB_LTEQ,
|
||||||
MDB_LIKE,
|
MDB_LIKE,
|
||||||
MDB_ISNULL,
|
MDB_ISNULL,
|
||||||
MDB_NOTNULL
|
MDB_NOTNULL,
|
||||||
|
MDB_ILIKE,
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef enum {
|
typedef enum {
|
||||||
@ -164,6 +165,7 @@ enum {
|
|||||||
x == MDB_GTEQ || \
|
x == MDB_GTEQ || \
|
||||||
x == MDB_LTEQ || \
|
x == MDB_LTEQ || \
|
||||||
x == MDB_LIKE || \
|
x == MDB_LIKE || \
|
||||||
|
x == MDB_ILIKE || \
|
||||||
x == MDB_ISNULL || \
|
x == MDB_ISNULL || \
|
||||||
x == MDB_NOTNULL )
|
x == MDB_NOTNULL )
|
||||||
|
|
||||||
@ -611,6 +613,7 @@ void mdb_dump_stats(MdbHandle *mdb);
|
|||||||
|
|
||||||
/* like.c */
|
/* like.c */
|
||||||
int mdb_like_cmp(char *s, char *r);
|
int mdb_like_cmp(char *s, char *r);
|
||||||
|
int mdb_ilike_cmp(char *s, char *r);
|
||||||
|
|
||||||
/* write.c */
|
/* write.c */
|
||||||
void mdb_put_int16(void *buf, guint32 offset, guint32 value);
|
void mdb_put_int16(void *buf, guint32 offset, guint32 value);
|
||||||
|
@ -244,11 +244,16 @@ gchar *g_locale_to_utf8(const gchar *opsysstring, size_t len,
|
|||||||
size_t *bytes_read, size_t *bytes_written, GError **error) {
|
size_t *bytes_read, size_t *bytes_written, GError **error) {
|
||||||
if (len == (size_t)-1)
|
if (len == (size_t)-1)
|
||||||
len = strlen(opsysstring);
|
len = strlen(opsysstring);
|
||||||
wchar_t *utf16 = malloc(sizeof(wchar_t)*(len+1));
|
size_t wlen = mbstowcs(NULL, opsysstring, 0);
|
||||||
if (mbstowcs(utf16, opsysstring, len+1) == (size_t)-1) {
|
if (wlen == (size_t)-1) {
|
||||||
free(utf16);
|
if (error) {
|
||||||
return g_strndup(opsysstring, len);
|
*error = malloc(sizeof(GError));
|
||||||
|
(*error)->message = g_strdup_printf("Invalid multibyte string: %s\n", opsysstring);
|
||||||
}
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
wchar_t *utf16 = malloc(sizeof(wchar_t)*(wlen+1));
|
||||||
|
mbstowcs(utf16, opsysstring, wlen+1);
|
||||||
gchar *utf8 = malloc(3*len+1);
|
gchar *utf8 = malloc(3*len+1);
|
||||||
gchar *dst = utf8;
|
gchar *dst = utf8;
|
||||||
for (size_t i=0; i<len; i++) {
|
for (size_t i=0; i<len; i++) {
|
||||||
@ -260,6 +265,10 @@ gchar *g_locale_to_utf8(const gchar *opsysstring, size_t len,
|
|||||||
return utf8;
|
return utf8;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
gchar *g_utf8_casefold(const gchar *str, gssize len) {
|
||||||
|
return g_utf8_strdown(str, len);
|
||||||
|
}
|
||||||
|
|
||||||
gchar *g_utf8_strdown(const gchar *str, gssize len) {
|
gchar *g_utf8_strdown(const gchar *str, gssize len) {
|
||||||
gssize i = 0;
|
gssize i = 0;
|
||||||
if (len == -1)
|
if (len == -1)
|
||||||
@ -547,11 +556,10 @@ gboolean g_option_context_parse(GOptionContext *context,
|
|||||||
while ((c = getopt_long(*argc, *argv, short_opts, long_opts, &longindex)) != -1) {
|
while ((c = getopt_long(*argc, *argv, short_opts, long_opts, &longindex)) != -1) {
|
||||||
if (c == '?') {
|
if (c == '?') {
|
||||||
*error = malloc(sizeof(GError));
|
*error = malloc(sizeof(GError));
|
||||||
(*error)->message = malloc(100);
|
|
||||||
if (optopt) {
|
if (optopt) {
|
||||||
snprintf((*error)->message, 100, "Unrecognized option: -%c", optopt);
|
(*error)->message = g_strdup_printf("Unrecognized option: -%c", optopt);
|
||||||
} else {
|
} else {
|
||||||
snprintf((*error)->message, 100, "Unrecognized option: %s", (*argv)[optind-1]);
|
(*error)->message = g_strdup_printf("Unrecognized option: %s", (*argv)[optind-1]);
|
||||||
}
|
}
|
||||||
free(short_opts);
|
free(short_opts);
|
||||||
free(long_opts);
|
free(long_opts);
|
||||||
|
@ -1014,7 +1014,7 @@ int mdb_index_compute_cost(MdbTableDef *table, MdbIndex *idx)
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* a like with a wild card first is useless as a sarg */
|
* a like with a wild card first is useless as a sarg */
|
||||||
if (sarg->op == MDB_LIKE && sarg->value.s[0]=='%')
|
if ((sarg->op == MDB_LIKE || sarg->op == MDB_ILIKE) && sarg->value.s[0]=='%')
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -1027,6 +1027,7 @@ int mdb_index_compute_cost(MdbTableDef *table, MdbIndex *idx)
|
|||||||
case MDB_EQUAL:
|
case MDB_EQUAL:
|
||||||
return 1; break;
|
return 1; break;
|
||||||
case MDB_LIKE:
|
case MDB_LIKE:
|
||||||
|
case MDB_ILIKE:
|
||||||
return 4; break;
|
return 4; break;
|
||||||
case MDB_ISNULL:
|
case MDB_ISNULL:
|
||||||
return 12; break;
|
return 12; break;
|
||||||
@ -1040,6 +1041,7 @@ int mdb_index_compute_cost(MdbTableDef *table, MdbIndex *idx)
|
|||||||
else return 1;
|
else return 1;
|
||||||
break;
|
break;
|
||||||
case MDB_LIKE:
|
case MDB_LIKE:
|
||||||
|
case MDB_ILIKE:
|
||||||
return 6; break;
|
return 6; break;
|
||||||
case MDB_ISNULL:
|
case MDB_ISNULL:
|
||||||
return 12; break;
|
return 12; break;
|
||||||
@ -1053,6 +1055,7 @@ int mdb_index_compute_cost(MdbTableDef *table, MdbIndex *idx)
|
|||||||
case MDB_EQUAL:
|
case MDB_EQUAL:
|
||||||
return 2; break;
|
return 2; break;
|
||||||
case MDB_LIKE:
|
case MDB_LIKE:
|
||||||
|
case MDB_ILIKE:
|
||||||
return 5; break;
|
return 5; break;
|
||||||
case MDB_ISNULL:
|
case MDB_ISNULL:
|
||||||
return 12; break;
|
return 12; break;
|
||||||
@ -1066,6 +1069,7 @@ int mdb_index_compute_cost(MdbTableDef *table, MdbIndex *idx)
|
|||||||
else return 2;
|
else return 2;
|
||||||
break;
|
break;
|
||||||
case MDB_LIKE:
|
case MDB_LIKE:
|
||||||
|
case MDB_ILIKE:
|
||||||
return 7; break;
|
return 7; break;
|
||||||
case MDB_ISNULL:
|
case MDB_ISNULL:
|
||||||
return 12; break;
|
return 12; break;
|
||||||
|
@ -39,11 +39,7 @@ int mdb_like_cmp(char *s, char *r)
|
|||||||
mdb_debug(MDB_DEBUG_LIKE, "comparing %s and %s", s, r);
|
mdb_debug(MDB_DEBUG_LIKE, "comparing %s and %s", s, r);
|
||||||
switch (r[0]) {
|
switch (r[0]) {
|
||||||
case '\0':
|
case '\0':
|
||||||
if (s[0]=='\0') {
|
return (s[0]=='\0');
|
||||||
return 1;
|
|
||||||
} else {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
case '_':
|
case '_':
|
||||||
/* skip one character */
|
/* skip one character */
|
||||||
return mdb_like_cmp(&s[1],&r[1]);
|
return mdb_like_cmp(&s[1],&r[1]);
|
||||||
@ -71,3 +67,25 @@ int mdb_like_cmp(char *s, char *r)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param s: String to search within.
|
||||||
|
* @param r: Case-insensitive search pattern.
|
||||||
|
*
|
||||||
|
* Tests the string @s to see if it matches the search pattern @r without
|
||||||
|
* regard to case; this mimics the behavior of the Access LIKE operator. In the
|
||||||
|
* search pattern, a percent sign indicates matching on any number of
|
||||||
|
* characters, and an underscore indicates matching any single character.
|
||||||
|
*
|
||||||
|
* @Returns: 1 if the string matches, 0 if the string does not match.
|
||||||
|
*/
|
||||||
|
int mdb_ilike_cmp(char *s, char *r) {
|
||||||
|
char *s1 = g_utf8_casefold(s, -1);
|
||||||
|
char *r1 = g_utf8_casefold(r, -1);
|
||||||
|
int result = mdb_like_cmp(s1, r1);
|
||||||
|
g_free(s1);
|
||||||
|
g_free(r1);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
@ -47,6 +47,9 @@ int rc;
|
|||||||
if (node->op == MDB_LIKE) {
|
if (node->op == MDB_LIKE) {
|
||||||
return mdb_like_cmp(s,node->value.s);
|
return mdb_like_cmp(s,node->value.s);
|
||||||
}
|
}
|
||||||
|
if (node->op == MDB_ILIKE) {
|
||||||
|
return mdb_ilike_cmp(s,node->value.s);
|
||||||
|
}
|
||||||
rc = strcoll(node->value.s, s);
|
rc = strcoll(node->value.s, s);
|
||||||
switch (node->op) {
|
switch (node->op) {
|
||||||
case MDB_EQUAL:
|
case MDB_EQUAL:
|
||||||
|
@ -71,6 +71,7 @@ null { return NUL; }
|
|||||||
"<" { return LT; }
|
"<" { return LT; }
|
||||||
">" { return GT; }
|
">" { return GT; }
|
||||||
like { return LIKE; }
|
like { return LIKE; }
|
||||||
|
ilike { return ILIKE; }
|
||||||
limit { return LIMIT; }
|
limit { return LIMIT; }
|
||||||
top { return TOP; }
|
top { return TOP; }
|
||||||
percent { return PERCENT; }
|
percent { return PERCENT; }
|
||||||
|
@ -324,6 +324,9 @@ mdb_sql_dump_node(MdbSargNode *node, int level)
|
|||||||
case MDB_LIKE:
|
case MDB_LIKE:
|
||||||
printf(" like %s\n", node->value.s);
|
printf(" like %s\n", node->value.s);
|
||||||
break;
|
break;
|
||||||
|
case MDB_ILIKE:
|
||||||
|
printf(" ilike %s\n", node->value.s);
|
||||||
|
break;
|
||||||
case MDB_EQUAL:
|
case MDB_EQUAL:
|
||||||
printf(" = %d\n", node->value.i);
|
printf(" = %d\n", node->value.i);
|
||||||
break;
|
break;
|
||||||
@ -398,6 +401,7 @@ mdb_sql_eval_expr(MdbSQL *sql, char *const1, int op, char *const2)
|
|||||||
case MDB_LT: compar = (value < 0); break;
|
case MDB_LT: compar = (value < 0); break;
|
||||||
case MDB_LTEQ: compar = (value <= 0); break;
|
case MDB_LTEQ: compar = (value <= 0); break;
|
||||||
case MDB_LIKE: compar = mdb_like_cmp(const1,const2); break;
|
case MDB_LIKE: compar = mdb_like_cmp(const1,const2); break;
|
||||||
|
case MDB_ILIKE: compar = mdb_ilike_cmp(const1,const2); break;
|
||||||
default: illop = 1;
|
default: illop = 1;
|
||||||
}
|
}
|
||||||
} else if (const1[0]!='\'' && const2[0]!='\'') {
|
} else if (const1[0]!='\'' && const2[0]!='\'') {
|
||||||
|
@ -63,7 +63,7 @@ typedef struct sql_context
|
|||||||
%token <name> IDENT NAME PATH STRING NUMBER OPENING CLOSING
|
%token <name> IDENT NAME PATH STRING NUMBER OPENING CLOSING
|
||||||
%token SELECT FROM WHERE CONNECT DISCONNECT TO LIST TABLES AND OR NOT LIMIT COUNT STRPTIME
|
%token SELECT FROM WHERE CONNECT DISCONNECT TO LIST TABLES AND OR NOT LIMIT COUNT STRPTIME
|
||||||
%token DESCRIBE TABLE TOP PERCENT
|
%token DESCRIBE TABLE TOP PERCENT
|
||||||
%token LTEQ GTEQ LIKE IS NUL
|
%token LTEQ GTEQ LIKE ILIKE IS NUL
|
||||||
|
|
||||||
%type <name> database
|
%type <name> database
|
||||||
%type <name> constant
|
%type <name> constant
|
||||||
@ -81,7 +81,7 @@ typedef struct sql_context
|
|||||||
%left OR
|
%left OR
|
||||||
%left AND
|
%left AND
|
||||||
%right NOT
|
%right NOT
|
||||||
%left EQ LTEQ GTEQ LT GT LIKE IS
|
%left EQ LTEQ GTEQ LT GT LIKE ILIKE IS
|
||||||
|
|
||||||
%%
|
%%
|
||||||
|
|
||||||
@ -193,6 +193,7 @@ operator:
|
|||||||
| LTEQ { $$ = MDB_LTEQ; }
|
| LTEQ { $$ = MDB_LTEQ; }
|
||||||
| GTEQ { $$ = MDB_GTEQ; }
|
| GTEQ { $$ = MDB_GTEQ; }
|
||||||
| LIKE { $$ = MDB_LIKE; }
|
| LIKE { $$ = MDB_LIKE; }
|
||||||
|
| ILIKE { $$ = MDB_ILIKE; }
|
||||||
;
|
;
|
||||||
|
|
||||||
nulloperator:
|
nulloperator:
|
||||||
|
@ -413,7 +413,10 @@ main(int argc, char **argv)
|
|||||||
|
|
||||||
while (1) {
|
while (1) {
|
||||||
line ++;
|
line ++;
|
||||||
if (s) free(s);
|
if (s) {
|
||||||
|
free(s);
|
||||||
|
s = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
if (in) {
|
if (in) {
|
||||||
s=calloc(bufsz, 1);
|
s=calloc(bufsz, 1);
|
||||||
@ -434,9 +437,13 @@ main(int argc, char **argv)
|
|||||||
s[strlen(s)-1]=0;
|
s[strlen(s)-1]=0;
|
||||||
} else {
|
} else {
|
||||||
snprintf(prompt, sizeof(prompt), "%d => ", line);
|
snprintf(prompt, sizeof(prompt), "%d => ", line);
|
||||||
s=readline(prompt);
|
locale = setlocale(LC_CTYPE, "");
|
||||||
if (!s)
|
char *l = readline(prompt);
|
||||||
|
setlocale(LC_CTYPE, locale);
|
||||||
|
if (!l)
|
||||||
break;
|
break;
|
||||||
|
s=g_locale_to_utf8(l, -1, NULL, NULL, NULL);
|
||||||
|
free(l);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!strcmp(s,"exit") || !strcmp(s,"quit") || !strcmp(s,"bye"))
|
if (!strcmp(s,"exit") || !strcmp(s,"quit") || !strcmp(s,"bye"))
|
||||||
|
Loading…
Reference in New Issue
Block a user