From 134306d1af0973e7fc55a67c811aba12f3b7d105 Mon Sep 17 00:00:00 2001 From: Evan Miller Date: Sun, 20 Dec 2020 18:02:23 -0500 Subject: [PATCH 1/2] Simplify header-reading logic and support JET3 code pages Using the notes and RC4 key provided in the HACKING file, decrypt the database definition page all at once instead of decrypting individual fields with ad-hoc keys. Use the newly decrypted header to access the database code page at offset 0x3C, and use this numeric value to initialize the iconv converter with an appropriate charset name for popular windows code pages. More encodings can be added later, with the eventual goal of getting rid of the MDB_JET3_CHARSET environment variable. Note that individual columns can have their own code pages but this issue is not addressed. An extra field is added to the MdbFile structure - because this struct is allocated internally, this should not break the public ABI. Finally, only set the db_passwd field if it's a JET3 database (see #144) --- include/mdbtools.h | 1 + src/libmdb/file.c | 32 ++++++++++++-------------------- src/libmdb/iconv.c | 29 +++++++++++++++++++++++------ 3 files changed, 36 insertions(+), 26 deletions(-) diff --git a/include/mdbtools.h b/include/mdbtools.h index 8c19783..5b93cfb 100644 --- a/include/mdbtools.h +++ b/include/mdbtools.h @@ -246,6 +246,7 @@ typedef struct { unsigned char *free_map; /* reference count */ int refs; + guint16 code_page; } MdbFile; /* offset to row count on data pages...version dependant */ diff --git a/src/libmdb/file.c b/src/libmdb/file.c index c60c703..28b536f 100644 --- a/src/libmdb/file.c +++ b/src/libmdb/file.c @@ -168,9 +168,6 @@ static char *mdb_find_file(const char *file_name) * Return value: The handle on success, NULL on failure */ static MdbHandle *mdb_handle_from_stream(FILE *stream, MdbFileFlags flags) { - int key[] = {0x86, 0xfb, 0xec, 0x37, 0x5d, 0x44, 0x9c, 0xfa, 0xc6, 0x5e, 0x28, 0xe6, 0x13, 0xb6}; - int j, pos; - MdbHandle *mdb = (MdbHandle *) 
g_malloc0(sizeof(MdbHandle)); mdb_set_default_backend(mdb, "access"); mdb_set_date_fmt(mdb, "%x %X"); @@ -216,27 +213,22 @@ static MdbHandle *mdb_handle_from_stream(FILE *stream, MdbFileFlags flags) { mdb_close(mdb); return NULL; } + + RC4_KEY rc4_key; + unsigned int tmp_key = 0x6b39dac7; + RC4_set_key(&rc4_key, 4, (unsigned char *)&tmp_key); + RC4(&rc4_key, mdb->f->jet_version == MDB_VER_JET3 ? 126 : 128, mdb->pg_buf + 0x18); + + mdb->f->code_page = mdb_get_int16(mdb->pg_buf, 0x3c); mdb->f->db_key = mdb_get_int32(mdb->pg_buf, 0x3e); - /* I don't know if this value is valid for some versions? - * it doesn't seem to be valid for the databases I have - * - * f->db_key ^= 0xe15e01b9; - */ - mdb->f->db_key ^= 0x4ebc8afb; - /* fprintf(stderr, "Encrypted file, RC4 key seed= %d\n", mdb->f->db_key); */ + if (mdb->f->jet_version == MDB_VER_JET3) { + /* JET4 needs additional masking with the DB creation date, currently unsupported */ + /* Bug - JET3 supports 20 byte passwords, this is currently just 14 bytes */ + memcpy(mdb->f->db_passwd, mdb->pg_buf + 0x42, sizeof(mdb->f->db_passwd)); + } /* write is not supported for encrypted files yet */ mdb->f->writable = mdb->f->writable && !mdb->f->db_key; - /* get the db password located at 0x42 bytes into the file */ - for (pos=0;pos<14;pos++) { - j = mdb_get_int32(mdb->pg_buf, 0x42+pos); - j ^= key[pos]; - if ( j != 0) - mdb->f->db_passwd[pos] = j; - else - mdb->f->db_passwd[pos] = '\0'; - } - mdb_iconv_init(mdb); return mdb; diff --git a/src/libmdb/iconv.c b/src/libmdb/iconv.c index 87264f1..0a3fdbe 100644 --- a/src/libmdb/iconv.c +++ b/src/libmdb/iconv.c @@ -253,13 +253,30 @@ void mdb_iconv_init(MdbHandle *mdb) mdb->iconv_out = iconv_open("UCS-2LE", iconv_code); mdb->iconv_in = iconv_open(iconv_code, "UCS-2LE"); } else { - /* According to Microsoft Knowledge Base pages 289525 and */ - /* 202427, code page info is not contained in the database */ - const char *jet3_iconv_code; - /* check environment variable */ - if 
(!(jet3_iconv_code=getenv("MDB_JET3_CHARSET"))) { - jet3_iconv_code="CP1252"; + const char *jet3_iconv_code = getenv("MDB_JET3_CHARSET"); + + if (!jet3_iconv_code) { + /* Use code page embedded in the database */ + /* Note that individual columns can override this value, + * but per-column code pages are not supported by libmdb */ + switch (mdb->f->code_page) { + case 874: jet3_iconv_code="WINDOWS-874"; break; + case 932: jet3_iconv_code="SHIFT-JIS"; break; + case 936: jet3_iconv_code="WINDOWS-936"; break; + case 950: jet3_iconv_code="BIG-5"; break; + case 951: jet3_iconv_code="BIG5-HKSCS"; break; + case 1250: jet3_iconv_code="WINDOWS-1250"; break; + case 1251: jet3_iconv_code="WINDOWS-1251"; break; + case 1252: jet3_iconv_code="WINDOWS-1252"; break; + case 1253: jet3_iconv_code="WINDOWS-1253"; break; + case 1254: jet3_iconv_code="WINDOWS-1254"; break; + case 1255: jet3_iconv_code="WINDOWS-1255"; break; + case 1256: jet3_iconv_code="WINDOWS-1256"; break; + case 1257: jet3_iconv_code="WINDOWS-1257"; break; + case 1258: jet3_iconv_code="WINDOWS-1258"; break; + default: jet3_iconv_code="CP1252"; break; + } } mdb->iconv_out = iconv_open(jet3_iconv_code, iconv_code); From f85905b5c862bf62ce935378a7b4a2b53f65b795 Mon Sep 17 00:00:00 2001 From: Evan Miller Date: Sun, 20 Dec 2020 19:54:51 -0500 Subject: [PATCH 2/2] Add a simple Latin-1 => UTF-8 transcoder w/o iconv --- src/libmdb/iconv.c | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/src/libmdb/iconv.c b/src/libmdb/iconv.c index 0a3fdbe..a779245 100644 --- a/src/libmdb/iconv.c +++ b/src/libmdb/iconv.c @@ -47,7 +47,7 @@ static size_t decompress_unicode(const char *src, size_t slen, char *dst, size_t } #if HAVE_ICONV -static size_t decompressed2ascii_with_iconv(MdbHandle *mdb, const char *in_ptr, size_t len_in, char *dest, size_t dlen) { +static size_t decompressed_to_utf8_with_iconv(MdbHandle *mdb, const char *in_ptr, size_t len_in, char *dest, size_t dlen) { char *out_ptr = 
dest; size_t len_out = dlen - 1; @@ -72,8 +72,27 @@ static size_t decompressed2ascii_with_iconv(MdbHandle *mdb, const char *in_ptr, return dlen; } #else -static size_t decompressed2ascii_without_iconv(MdbHandle *mdb, const char *in_ptr, size_t len_in, char *dest, size_t dlen) { +static size_t latin1_to_utf8_without_iconv(const char *in_ptr, size_t len_in, char *dest, size_t dlen) { + char *out = dest; + size_t i; + for(i=0; i<len_in && out < dest + dlen - 1 - ((unsigned char)in_ptr[i] >> 7); i++) { + unsigned char c = in_ptr[i]; + if(c & 0x80) { + *out++ = 0xC0 | (c >> 6); + *out++ = 0x80 | (c & 0x3F); + } else { + *out++ = c; + } + } + *out = '\0'; + return out - dest; +} + +static size_t decompressed_to_utf8_without_iconv(MdbHandle *mdb, const char *in_ptr, size_t len_in, char *dest, size_t dlen) { if (IS_JET3(mdb)) { + if (mdb->f->code_page == 1252) { + return latin1_to_utf8_without_iconv(in_ptr, len_in, dest, dlen); + } int count = 0; snprintf(dest, dlen, "%.*s%n", (int)len_in, in_ptr, &count); return count; @@ -135,9 +154,9 @@ mdb_unicode2ascii(MdbHandle *mdb, const char *src, size_t slen, char *dest, size } #if HAVE_ICONV - dlen = decompressed2ascii_with_iconv(mdb, in_ptr, len_in, dest, dlen); + dlen = decompressed_to_utf8_with_iconv(mdb, in_ptr, len_in, dest, dlen); #else - dlen = decompressed2ascii_without_iconv(mdb, in_ptr, len_in, dest, dlen); + dlen = decompressed_to_utf8_without_iconv(mdb, in_ptr, len_in, dest, dlen); #endif if (tmp) g_free(tmp);