Better compressed text handling

This commit is contained in:
whydoubt
2004-12-11 06:07:20 +00:00
parent fa8d24dd2b
commit d271b5fae5
10 changed files with 192 additions and 89 deletions

View File

@@ -1,3 +1,14 @@
Sat Dec 11 00:03:17 CST 2004 Jeff Smith <whydoubt@yahoo.com>
* HACKING:
* include/mdbtools.h:
* src/libmdb/data.c:
* src/libmdb/iconv.c:
* src/libmdb/index.c:
* src/libmdb/sargs.c:
* src/libmdb/table.c:
* src/odbc/odbc.c:
* src/sql/mdbsql.c: Better compressed text handling
Wed Dec 1 18:18:02 EST 2004 Brian Bruns <brian@bruns.com>
* src/libmdb/data.c:
* src/libmdb/iconv.c:

24
HACKING
View File

@@ -697,3 +697,27 @@ Next comes one of more rows of data. (column names, descriptions, etc...)
See kkd.c for an example, although it needs cleanup.
Text Data Type
--------------
In Jet3, the encoding of text depends on the machine on which it was created.
So for databases created on U.S. English systems, it can be expected that text
is encoded in CP1252. This is the default used by mdbtools. If you know that
another encoding has been used, you can override the default by setting the
environment variable MDB_JET3_CHARSET. To find out what encodings will work on
your system, run 'iconv -l'.
In Jet4, the encoding can be either little-endian UCS-2, or a special
compressed form of it. This compressed format begins with 0xff 0xfe.
The string then starts in compressed mode, in which a character whose
most-significant byte is 0x00 is stored as just its least-significant byte.
In the compressed format, a 0x00 byte
signals a change from compressed mode to uncompressed mode, or from
uncompressed mode back to compressed mode. The string may end in either mode.
Note that a string containing any character 0x##00 (UCS-2) will not be
compressed, since that character's low byte would be read as a mode switch.
Also, the string will only be compressed if it really does make
the string shorter as compared to uncompressed UCS-2.
Programs that use mdbtools libraries will receive strings encoded in UTF-8 by
default. This default can be overridden by setting the environment variable
MDB_ICONV to the desired encoding.

View File

@@ -230,7 +230,6 @@ typedef struct {
#ifdef HAVE_ICONV
iconv_t iconv_in;
iconv_t iconv_out;
iconv_t iconv_compress;
#endif
} MdbHandle;
@@ -530,7 +529,7 @@ extern int mdb_get_option(unsigned long optnum);
extern void mdb_debug(int klass, char *fmt, ...);
/* iconv.c */
extern int mdb_unicode2ascii(MdbHandle *mdb, unsigned char *buf, int offset, unsigned int len, char *dest, unsigned int dest_sz);
extern int mdb_ascii2unicode(MdbHandle *mdb, unsigned char *buf, int offset, unsigned int len, char *dest, unsigned int dest_sz);
extern int mdb_unicode2ascii(MdbHandle *mdb, unsigned char *src, unsigned int slen, unsigned char *dest, unsigned int dlen);
extern int mdb_ascii2unicode(MdbHandle *mdb, unsigned char *src, unsigned int slen, unsigned char *dest, unsigned int dlen);
#endif /* _mdbtools_h_ */

View File

@@ -634,7 +634,7 @@ static char *mdb_memo_to_string(MdbHandle *mdb, int start, int size)
pg_row & 0xff, row_start, len);
buffer_dump(mdb->pg_buf, row_start, row_start + len);
#endif
mdb_unicode2ascii(mdb, buf, row_start, len, text, MDB_BIND_SIZE);
mdb_unicode2ascii(mdb, buf + row_start, len, text, MDB_BIND_SIZE);
return text;
} else { /* if (memo_flags == 0x0000) { */
pg_row = mdb_get_int32(mdb->pg_buf, start+4);
@@ -737,7 +737,7 @@ char *mdb_col_to_string(MdbHandle *mdb, unsigned char *buf, int start, int datat
if (size<0) {
return "";
}
mdb_unicode2ascii(mdb, mdb->pg_buf, start, size, text, MDB_BIND_SIZE);
mdb_unicode2ascii(mdb, mdb->pg_buf + start, size, text, MDB_BIND_SIZE);
return text;
break;
case MDB_SDATETIME:

View File

@@ -23,86 +23,159 @@
#include "dmalloc.h"
#endif
/*
* This function is used in reading text data from an MDB table.
*/
int
mdb_unicode2ascii(MdbHandle *mdb, unsigned char *buf, int offset, unsigned int len, char *dest, unsigned int dest_sz)
mdb_unicode2ascii(MdbHandle *mdb, unsigned char *src, unsigned int slen, unsigned char *dest, unsigned int dlen)
{
unsigned int i, ret;
int len_in, len_out;
unsigned char *in_ptr, *out_ptr;
unsigned char *tmp = NULL;
unsigned int tlen = 0;
int len_in, len_out;
char *in_ptr, *out_ptr;
in_ptr = &buf[offset];
out_ptr = dest;
len_in = len;
len_out = dest_sz;
if ((!src) || (!dest))
return 0;
/* Uncompress 'Unicode Compressed' string into tmp */
if (IS_JET4(mdb) && (slen>=2) && (src[0]==0xff) && (src[1]==0xfe)) {
unsigned int compress=1;
src += 2;
slen -= 2;
tmp = (char *)g_malloc(slen*2);
while (slen) {
if (*src == 0) {
compress = (compress) ? 0 : 1;
src++;
slen--;
} else if (compress) {
tmp[tlen++] = *src++;
tmp[tlen++] = 0;
slen--;
} else if (slen >= 2){
tmp[tlen++] = *src++;
tmp[tlen++] = *src++;
slen-=2;
}
}
}
if (buf[offset]==0xff && buf[offset+1]==0xfe) {
len_in -= 2;
in_ptr += 2;
ret = iconv(mdb->iconv_compress, (char **)&in_ptr, &len_in, (char **)&out_ptr, &len_out);
dest[dest_sz - len_out]='\0';
return dest_sz - len_out;
//strncpy(dest, in_ptr+2, len-2);
//dest[len-2]='\0';
in_ptr = (tmp) ? tmp : src;
out_ptr = dest;
len_in = (tmp) ? tlen : slen;
len_out = dlen;
#if HAVE_ICONV
//printf("1 len_in %d len_out %d\n",len_in, len_out);
while (1) {
iconv(mdb->iconv_in, &in_ptr, &len_in, &out_ptr, &len_out);
if (!len_in) break;
/* Don't bail if impossible conversion is encountered */
in_ptr += (IS_JET4(mdb)) ? 2 : 1;
len_in -= (IS_JET4(mdb)) ? 2 : 1;
*out_ptr++ = '?';
len_out--;
}
//printf("2 len_in %d len_out %d\n",len_in, len_out);
dlen -= len_out;
#else
if (IS_JET3(mdb)) {
strncpy(out_ptr, in_ptr, len_in);
dlen = len_in;
} else {
#ifdef HAVE_ICONV
if (mdb->iconv_in) {
//printf("1 len_in %d len_out %d\n",len_in, len_out);
ret = iconv(mdb->iconv_in, (char **)&in_ptr, &len_in, (char **)&out_ptr, &len_out);
//printf("2 len_in %d len_out %d\n",len_in, len_out);
dest[dest_sz - len_out]='\0';
//printf("dest %s\n",dest);
return dest_sz - len_out;
}
/* rough UCS-2LE to ISO-8859-1 conversion */
unsigned int i;
for (i=0; i<len_in; i+=2)
dest[i/2] = (in_ptr[i+1] == 0) ? in_ptr[i] : '?';
dlen = len_in/2;
}
#endif
/* convert unicode to ascii, rather sloppily */
for (i=0;i<len;i+=2)
dest[i/2] = in_ptr[i];
dest[len/2]='\0';
}
return len;
if (tmp) g_free(tmp);
dest[dlen]='\0';
//printf("dest %s\n",dest);
return dlen;
}
/*
* This function is used in writing text data to an MDB table.
* If slen is 0, strlen will be used to calculate src's length.
*/
int
mdb_ascii2unicode(MdbHandle *mdb, unsigned char *buf, int offset, unsigned int len, char *dest, unsigned int dest_sz)
mdb_ascii2unicode(MdbHandle *mdb, unsigned char *src, unsigned int slen, unsigned char *dest, unsigned int dlen)
{
unsigned int i = 0, ret;
size_t len_in, len_out, len_orig;
size_t len_in, len_out;
char *in_ptr, *out_ptr;
in_ptr = &buf[offset];
out_ptr = dest;
len_orig = strlen(in_ptr);
len_in = len_orig;
len_out = dest_sz;
if ((!src) || (!dest))
return 0;
if (!buf) return 0;
in_ptr = src;
out_ptr = dest;
len_in = (slen) ? slen : strlen(src);
len_out = dlen;
#ifdef HAVE_ICONV
if (mdb->iconv_out) {
ret = iconv(mdb->iconv_out, &in_ptr, &len_in, &out_ptr, &len_out);
//printf("len_in %d len_out %d\n",len_in, len_out);
dest[dest_sz - len_out]='\0';
dest[dest_sz - len_out + 1]='\0';
return dest_sz - len_out;
iconv(mdb->iconv_out, &in_ptr, &len_in, &out_ptr, &len_out);
//printf("len_in %d len_out %d\n", len_in, len_out);
dlen -= len_out;
#else
if (IS_JET3(mdb)) {
dlen = MIN(len_in, len_out);
strncpy(out_ptr, in_ptr, dlen);
} else {
unsigned int i;
slen = MIN(len_in, len_out/2);
dlen = slen*2;
for (i=0; i<slen; i++) {
out_ptr[i*2] = in_ptr[i];
out_ptr[i*2+1] = 0;
}
}
#endif
if (IS_JET3(mdb)) {
strncpy(dest, in_ptr, len);
dest[len]='\0';
return strlen(dest);
/* Unicode Compression */
if(IS_JET4(mdb) && (dlen>4)) {
char *tmp = g_malloc(dlen);
int tptr = 0, dptr = 0;
int comp = 1;
tmp[tptr++] = 0xff;
tmp[tptr++] = 0xfe;
while((dptr < dlen) && (tptr < dlen)) {
if (((dest[dptr+1]==0) && (comp==0))
|| ((dest[dptr+1]!=0) && (comp==1))) {
/* switch encoding mode */
tmp[tptr++] = 0;
comp = (comp) ? 0 : 1;
} else if (dest[dptr]==0) {
/* this string cannot be compressed */
tptr = dlen;
} else if (comp==1) {
/* encode compressed character */
tmp[tptr++] = dest[dptr];
dptr += 2;
} else if (tptr+1 < dlen) {
/* encode uncompressed character */
tmp[tptr++] = dest[dptr];
tmp[tptr++] = dest[dptr+1];
dptr += 2;
} else {
/* could not encode uncompressed character
* into single byte */
tptr = dlen;
}
}
if (tptr < dlen) {
memcpy(dest, tmp, tptr);
dlen = tptr;
}
g_free(tmp);
}
while (i<strlen(in_ptr) && (i*2+2)<len) {
dest[i*2] = in_ptr[i];
dest[i*2+1] = 0;
i++;
}
return (i*2);
return dlen;
}
void mdb_iconv_init(MdbHandle *mdb)
{
char *iconv_code;
@@ -112,21 +185,20 @@ void mdb_iconv_init(MdbHandle *mdb)
iconv_code="UTF-8";
}
#ifdef HAVE_ICONV
if (IS_JET4(mdb)) {
mdb->iconv_out = iconv_open("UCS-2LE", iconv_code);
mdb->iconv_in = iconv_open(iconv_code, "UCS-2LE");
mdb->iconv_compress = iconv_open(iconv_code, "ISO8859-1");
} else {
/* ToDO - need to determine character set from file */
/* But according to MS kb289525 and kb202427, there is not such info in jet3 db */
/* According to Microsoft Knowledge Base pages 289525 and */
/* 202427, code page info is not contained in the database */
char *jet3_iconv_code;
/* check environment variable */
if (!(jet3_iconv_code=(char *)getenv("MDB_JET3_CHARSET"))) {
jet3_iconv_code="ISO8859-1";
jet3_iconv_code="CP1252";
}
mdb->iconv_out = iconv_open(jet3_iconv_code, iconv_code);
mdb->iconv_in = iconv_open(iconv_code, jet3_iconv_code);
}
@@ -137,8 +209,5 @@ void mdb_iconv_close(MdbHandle *mdb)
#ifdef HAVE_ICONV
if (mdb->iconv_out != (iconv_t)-1) iconv_close(mdb->iconv_out);
if (mdb->iconv_in != (iconv_t)-1) iconv_close(mdb->iconv_in);
if (IS_JET4(mdb)) {
if (mdb->iconv_compress != (iconv_t)-1) iconv_close(mdb->iconv_compress);
}
#endif
}

View File

@@ -111,7 +111,7 @@ mdb_read_indices(MdbTableDef *table)
tmpbuf = g_malloc(name_sz);
read_pg_if_n(mdb, tmpbuf, &cur_pos, name_sz);
cur_pos += name_sz;
mdb_unicode2ascii(mdb, tmpbuf, 0, name_sz, pidx->name, name_sz);
mdb_unicode2ascii(mdb, tmpbuf, name_sz, pidx->name, MDB_MAX_OBJ_NAME);
g_free(tmpbuf);
//fprintf(stderr, "index name %s\n", pidx->name);
}

View File

@@ -154,7 +154,7 @@ int lastchar;
return mdb_test_int(node, (gint32)mdb_get_int32(field->value, 0));
break;
case MDB_TEXT:
mdb_unicode2ascii(mdb, field->value, 0, field->siz, tmpbuf, 256);
mdb_unicode2ascii(mdb, field->value, field->siz, tmpbuf, 256);
return mdb_test_string(node, tmpbuf);
default:
fprintf(stderr, "Calling mdb_test_sarg on unknown type. Add code to mdb_test_sarg() for type %d\n",col->col_type);

View File

@@ -269,9 +269,9 @@ GPtrArray *mdb_read_columns(MdbTableDef *table)
** column names - ordered the same as the column attributes table
*/
for (i=0;i<table->num_cols;i++) {
char *tmp_buf;
pcol = g_ptr_array_index(table->columns, i);
char *tmp_buf;
if (IS_JET4(mdb)) {
name_sz = read_pg_if_16(mdb, &cur_pos);
cur_pos += 2;
@@ -285,7 +285,7 @@ GPtrArray *mdb_read_columns(MdbTableDef *table)
}
tmp_buf = (char *) g_malloc(name_sz);
read_pg_if_n(mdb, tmp_buf, &cur_pos, name_sz);
mdb_unicode2ascii(mdb, tmp_buf, 0, name_sz, pcol->name, name_sz);
mdb_unicode2ascii(mdb, tmp_buf, name_sz, pcol->name, MDB_MAX_OBJ_NAME);
g_free(tmp_buf);
cur_pos += name_sz;

View File

@@ -27,7 +27,7 @@
#include "connectparams.h"
static char software_version[] = "$Id: odbc.c,v 1.26 2004/11/27 18:18:55 whydoubt Exp $";
static char software_version[] = "$Id: odbc.c,v 1.27 2004/12/11 06:07:22 whydoubt Exp $";
static void *no_unused_var_warn[] = {software_version,
no_unused_var_warn};
@@ -1081,9 +1081,9 @@ SQLRETURN SQL_API SQLColumns(
for (j=0; j<table->num_cols; j++) {
col = g_ptr_array_index(table->columns, j);
ts2 = mdb_ascii2unicode(mdb, table->name, 0, 100, t2, MDB_BIND_SIZE);
ts3 = mdb_ascii2unicode(mdb, col->name, 0, 100, t3, MDB_BIND_SIZE);
ts5 = mdb_ascii2unicode(mdb, "FIX ME", 0, 100, t5, MDB_BIND_SIZE);
ts2 = mdb_ascii2unicode(mdb, table->name, 0, t2, MDB_BIND_SIZE);
ts3 = mdb_ascii2unicode(mdb, col->name, 0, t3, MDB_BIND_SIZE);
ts5 = mdb_ascii2unicode(mdb, "FIX ME", 0, t5, MDB_BIND_SIZE);
nullable = SQL_NO_NULLS;
datatype = _odbc_get_client_type(col->col_type);
sqldatatype = _odbc_get_client_type(col->col_type);
@@ -1384,11 +1384,11 @@ SQLRETURN SQL_API SQLGetTypeInfo(
if (fSqlType && (fSqlType != type_info[i].data_type))
continue;
ts0 = mdb_ascii2unicode(mdb, type_info[i].type_name, 0, 100, t0, MDB_BIND_SIZE);
ts3 = mdb_ascii2unicode(mdb, type_info[i].literal_prefix, 0, 100, t3, MDB_BIND_SIZE);
ts4 = mdb_ascii2unicode(mdb, type_info[i].literal_suffix, 0, 100, t4, MDB_BIND_SIZE);
ts5 = mdb_ascii2unicode(mdb, type_info[i].create_params, 0, 100, t5, MDB_BIND_SIZE);
ts12 = mdb_ascii2unicode(mdb, type_info[i].local_type_name, 0, 100, t12, MDB_BIND_SIZE);
ts0 = mdb_ascii2unicode(mdb, type_info[i].type_name, 0, t0, MDB_BIND_SIZE);
ts3 = mdb_ascii2unicode(mdb, type_info[i].literal_prefix, 0, t3, MDB_BIND_SIZE);
ts4 = mdb_ascii2unicode(mdb, type_info[i].literal_suffix, 0, t4, MDB_BIND_SIZE);
ts5 = mdb_ascii2unicode(mdb, type_info[i].create_params, 0, t5, MDB_BIND_SIZE);
ts12 = mdb_ascii2unicode(mdb, type_info[i].local_type_name, 0, t12, MDB_BIND_SIZE);
FILL_FIELD(&fields[0], t0, ts0);
FILL_FIELD(&fields[1],&type_info[i].data_type, 0);
@@ -1542,8 +1542,8 @@ SQLRETURN SQL_API SQLTables(
FILL_FIELD(&fields[j], NULL, 0);
}
ts2 = mdb_ascii2unicode(mdb, entry->object_name, 0, 100, t2, MDB_BIND_SIZE);
ts3 = mdb_ascii2unicode(mdb, table_types[ttype], 0, 100, t3, MDB_BIND_SIZE);
ts2 = mdb_ascii2unicode(mdb, entry->object_name, 0, t2, MDB_BIND_SIZE);
ts3 = mdb_ascii2unicode(mdb, table_types[ttype], 0, t3, MDB_BIND_SIZE);
FILL_FIELD(&fields[2], t2, ts2);
FILL_FIELD(&fields[3], t3, ts3);

View File

@@ -540,7 +540,7 @@ void mdb_sql_listtables(MdbSQL *sql)
entry = g_ptr_array_index (mdb->catalog, i);
if (mdb_is_user_table(entry)) {
//col = g_ptr_array_index(table->columns,0);
tmpsiz = mdb_ascii2unicode(mdb, entry->object_name, 0, strlen(entry->object_name), tmpstr, 100);
tmpsiz = mdb_ascii2unicode(mdb, entry->object_name, 0, tmpstr, 100);
mdb_fill_temp_field(&fields[0],tmpstr, tmpsiz, 0,0,0,0);
row_size = mdb_pack_row(ttable, row_buffer, 1, fields);
mdb_add_row_to_pg(ttable,row_buffer, row_size);
@@ -605,15 +605,15 @@ void mdb_sql_describe_table(MdbSQL *sql)
for (i=0;i<table->num_cols;i++) {
col = g_ptr_array_index(table->columns,i);
tmpsiz = mdb_ascii2unicode(mdb, col->name, 0, strlen(col->name), col_name, 100);
tmpsiz = mdb_ascii2unicode(mdb, col->name, 0, col_name, 100);
mdb_fill_temp_field(&fields[0],col_name, tmpsiz, 0,0,0,0);
strcpy(tmpstr, mdb_get_coltype_string(mdb->default_backend, col->col_type));
tmpsiz = mdb_ascii2unicode(mdb, tmpstr, 0, strlen(col->name), col_type, 100);
tmpsiz = mdb_ascii2unicode(mdb, tmpstr, 0, col_type, 100);
mdb_fill_temp_field(&fields[1],col_type, tmpsiz, 0,0,0,1);
sprintf(tmpstr,"%d",col->col_size);
tmpsiz = mdb_ascii2unicode(mdb, tmpstr, 0, strlen(tmpstr), col_size, 100);
tmpsiz = mdb_ascii2unicode(mdb, tmpstr, 0, col_size, 100);
mdb_fill_temp_field(&fields[2],col_size, tmpsiz, 0,0,0,2);
row_size = mdb_pack_row(ttable, row_buffer, 3, fields);