Merge pull request #42 from evanmiller/mdb-exportjson

mdb-json tool
2025-07-15 14:59:19 +08:00 · 2020-09-02 12:05:02 -04:00 · 2020-09-02 12:05:02 -04:00 · b7dd44d0d4
commit b7dd44d0d4
parent a89df9f5e3 f27b89f60a
8 changed files with 291 additions and 13 deletions
--- a/.gitignore
+++ b/.gitignore
@ -39,6 +39,7 @@ src/sql/parser.h
 src/util/mdb-array
 src/util/mdb-count
 src/util/mdb-export
+src/util/mdb-json
 src/util/mdb-header
 src/util/mdb-import
 src/util/mdb-parsecsv
--- a/.travis.yml
+++ b/.travis.yml
@ -165,8 +165,8 @@ before_script:
 script:
  - ./configure --disable-silent-rules $CONFIGURE_FLAGS
  - make
-  - ./src/util/mdb-array test/data/ASampleDatabase.accdb "Asset Items"
-  - ./src/util/mdb-array test/data/nwind.mdb "Customers"
+  - ./src/util/mdb-json test/data/ASampleDatabase.accdb "Asset Items"
+  - ./src/util/mdb-json test/data/nwind.mdb "Customers"
  - ./src/util/mdb-count test/data/ASampleDatabase.accdb "Asset Items"
  - ./src/util/mdb-count test/data/nwind.mdb "Customers"
  - ./src/util/mdb-prop test/data/ASampleDatabase.accdb "Asset Items"
--- a/README.md
+++ b/README.md
@ -50,7 +50,8 @@ Provides command line utilities, including:
 | ------- | ----------- |
 | `mdb-ver` | Prints the version (JET 3 or 4) of an mdb file. |
 | `mdb-schema` | Prints DDL for the specified table. |
-| `mdb-export` | Export table to CSV format. |
+| `mdb-export` | Export table to CSV or SQL formats. |
+| `mdb-json` | Export table to JSON format. |
 | `mdb-tables` | A simple dump of table names to be used with shell scripts. |
 | `mdb-count` | A simple count of number of rows in a table, to be used in shell scripts and ETL pipelines. |
 | `mdb-header` | Generates a C header to be used in exporting mdb data to a C prog. |
--- a/configure.ac
+++ b/configure.ac
@ -276,6 +276,8 @@ if test x$enable_gtk_doc = xauto ; then
 fi

 AM_CONDITIONAL(ENABLE_GTK_DOC, test x$enable_gtk_doc = xyes)
+AM_CONDITIONAL(HAVE_GNOME_DOC_UTILS, test x$enable_gtk_doc = xyes)
+AM_CONDITIONAL(ENABLE_SK, test x$enable_gtk_doc = xyes)

 ##################################################
 # Check for txt2man
--- a/src/util/Makefile.am
+++ b/src/util/Makefile.am
@ -1,15 +1,7 @@
 AUTOMAKE_OPTIONS = subdir-objects
 SUBDIRS = bash-completion
-bin_PROGRAMS	=	mdb-export mdb-array mdb-schema mdb-tables mdb-parsecsv mdb-header mdb-sql mdb-ver mdb-prop mdb-count mdb-queries
+bin_PROGRAMS	=	mdb-export mdb-array mdb-schema mdb-tables mdb-parsecsv mdb-header mdb-sql mdb-ver mdb-prop mdb-count mdb-queries mdb-json
 noinst_PROGRAMS = mdb-import prtable prcat prdata prkkd prdump prole updrow prindex
-mdb_export_SOURCES = mdb-export.c
-mdb_schema_SOURCES = mdb-schema.c
-mdb_tables_SOURCES = mdb-tables.c
-mdb_sql_SOURCES = mdb-sql.c
-mdb_ver_SOURCES = mdb-ver.c
-mdb_import_SOURCES = mdb-import.c
-mdb_queries_SOURCES = mdb-queries.c
-updrow_SOURCES = updrow.c
 LIBS	=	$(GLIB_LIBS) @LIBS@
 DEFS = @DEFS@ -DLOCALEDIR=\"$(localedir)\"
 AM_CFLAGS	=	-I$(top_srcdir)/include $(GLIB_CFLAGS) -Wsign-compare
--- a/src/util/base64.h
+++ b/src/util/base64.h
@ -0,0 +1,80 @@
+// https://en.wikibooks.org/wiki/Algorithm_Implementation/Miscellaneous/Base64
+
+#include <inttypes.h>
+#include <string.h>
+ 
+// TODO: split into a header (declaration) and an implementation file
+/*
+ * Encode dataLength bytes read from data_buf as '='-padded Base64 text and
+ * store it, NUL-terminated, in result.
+ *
+ * resultSize is the capacity of result in bytes, terminator included.
+ *
+ * Returns 0 on success. Returns 1 as soon as the next output byte would not
+ * fit; in that case result holds a partial encoding and is NOT terminated.
+ */
+static int base64encode(const void* data_buf, size_t dataLength, char* result, size_t resultSize)
+{
+   static const char alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+   const uint8_t *src = (const uint8_t *)data_buf;
+   const size_t tail = dataLength % 3;   /* bytes in the final short group, 0 if none */
+   size_t out = 0;
+   size_t pos;
+
+   /* walk the input one 3-byte group at a time */
+   for (pos = 0; pos < dataLength; pos += 3)
+   {
+      const size_t avail = dataLength - pos;                 /* always >= 1 here */
+      const size_t symbols = (avail >= 3) ? 4 : avail + 1;   /* 1 byte -> 2 chars, 2 -> 3, 3 -> 4 */
+      uint32_t group;
+      size_t k;
+
+      /* pack up to three bytes into one 24-bit value; widen before shifting */
+      group = (uint32_t)src[pos] << 16;
+      if (avail > 1)
+         group |= (uint32_t)src[pos + 1] << 8;
+      if (avail > 2)
+         group |= src[pos + 2];
+
+      /* emit the 6-bit slices, most significant first */
+      for (k = 0; k < symbols; k++)
+      {
+         if (out >= resultSize) return 1;   /* indicate failure: buffer too small */
+         result[out++] = alphabet[(group >> (18 - 6 * k)) & 63];
+      }
+   }
+
+   /* a short final group is padded with '=' up to four output characters */
+   if (tail != 0)
+   {
+      size_t pad;
+
+      for (pad = 3 - tail; pad > 0; pad--)
+      {
+         if (out >= resultSize) return 1;   /* indicate failure: buffer too small */
+         result[out++] = '=';
+      }
+   }
+
+   if (out >= resultSize) return 1;   /* no room left for the terminator */
+   result[out] = '\0';
+   return 0;   /* indicate success */
+}
--- a/src/util/mdb-export.c
+++ b/src/util/mdb-export.c
@ -107,7 +107,7 @@ main(int argc, char **argv)
 		{"row-delimiter", 'R', 0, G_OPTION_ARG_STRING, &row_delimiter, "Specify a row delimiter", "char"},
 		{"quote", 'q', 0, G_OPTION_ARG_STRING, &quote_char, "Use <char> to wrap text-like fields. Default is double quote.", "char"},
 		{"backend", 'I', 0, G_OPTION_ARG_STRING, &insert_dialect, "INSERT statements (instead of CSV)", "backend"},
-		{"date_format", 'D', 0, G_OPTION_ARG_STRING, &date_fmt, "Set the date format (see strftime(3) for details)", "format"},
+		{"date-format", 'D', 0, G_OPTION_ARG_STRING, &date_fmt, "Set the date format (see strftime(3) for details)", "format"},
 		{"escape", 'X', 0, G_OPTION_ARG_STRING, &escape_char, "Use <char> to escape quoted characters within a field. Default is doubling.", "format"},
 		{"namespace", 'N', 0, G_OPTION_ARG_STRING, &namespace, "Prefix identifiers with namespace", "namespace"},
 		{"null", '0', 0, G_OPTION_ARG_STRING, &null_text, "Use <char> to represent a NULL value", "char"},
--- a/src/util/mdb-json.c
+++ b/src/util/mdb-json.c
@ -0,0 +1,202 @@
+/* MDB Tools - A library for reading MS Access database file
+ * Copyright (C) 2000 Brian Bruns
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "mdbtools.h"
+
+#include "base64.h"
+
+#undef MDB_BIND_SIZE
+#define MDB_BIND_SIZE 200000
+
+#define is_quote_type(x) (x==MDB_TEXT || x==MDB_OLE || x==MDB_MEMO || x==MDB_DATETIME || x==MDB_BINARY || x==MDB_REPID)
+#define is_binary_type(x) (x==MDB_OLE || x==MDB_BINARY || x==MDB_REPID)
+
+static char *quote_char = "\"";
+static char *escape_char = "\\";
+static char *separator_char = ":";
+static char *row_start = "{";
+static char *row_end = "}\n";
+static char *delimiter = ",";
+static size_t quote_len = 1; //strlen(quote_char); /* multibyte */
+static size_t orig_escape_len = 1; //strlen(escape_char);
+static int drop_nonascii = 0;
+
+//#define DONT_ESCAPE_ESCAPE
+/*
+ * Write value to outfile as a quoted string.
+ *
+ * bin_len == -1 means value is NUL-terminated; any other value is the number
+ * of loop iterations (one per input character for the default one-byte quote
+ * and escape strings) to process, so embedded NUL bytes are emitted too.
+ *
+ * Occurrences of the quote string and of the escape string are prefixed with
+ * the escape string. Bytes below 0x20 are written as \u00XX, or as a single
+ * space when drop_nonascii is set. Every other byte is copied unchanged.
+ */
+static void
+print_quoted_value(FILE *outfile, char* value, int bin_len) {
+	const int counted = (bin_len != -1);
+
+	fputs(quote_char, outfile);
+	for (;;) {
+		/* end of input: counter exhausted, or NUL sentinel reached */
+		if (counted) {
+			if (bin_len-- == 0)
+				break;
+		} else if (*value == '\0') {
+			break;
+		}
+
+		if (quote_len && strncmp(value, quote_char, quote_len) == 0) {
+			fprintf(outfile, "%s%s", escape_char, quote_char);
+			value += quote_len;
+			continue;
+		}
+#ifndef DONT_ESCAPE_ESCAPE
+		if (orig_escape_len && strncmp(value, escape_char, orig_escape_len) == 0) {
+			fprintf(outfile, "%s%s", escape_char, escape_char);
+			value += orig_escape_len;
+			continue;
+		}
+#endif
+		{
+			const unsigned char byte = (unsigned char)*value++;
+
+			if (byte >= 0x20) {
+				putc(byte, outfile);
+			} else if (drop_nonascii) {
+				putc(' ', outfile);
+			} else {
+				/* escape control codes / binary data */
+				fprintf(outfile, "\\u00%02x", byte);
+			}
+		}
+	}
+	fputs(quote_char, outfile);
+}
+
+/*
+ * Write bin_len bytes of value to outfile as a JSON object of the form
+ *   {"$binary": "<base64>", "$type": "00"}
+ *
+ * A negative bin_len is treated as zero bytes. If Base64 encoding fails, an
+ * error is reported on stderr and an empty payload is written, so the output
+ * never contains the contents of an unterminated buffer.
+ */
+static void
+print_binary_value(FILE *outfile, char const * value, int bin_len) {
+	/* clamp: a negative length would otherwise turn into a huge size_t */
+	size_t const data_len = (bin_len > 0) ? (size_t)bin_len : 0;
+	/* four output characters per (possibly padded) 3-byte group, plus NUL */
+	size_t const base64_buf_len = (data_len + 2) / 3 * 4 + 1;
+	char *base64_buf = g_malloc(base64_buf_len);
+
+	fputs("{\"$binary\": \"", outfile);
+	if (base64encode(value, data_len, base64_buf, base64_buf_len) != 0) {
+		fprintf(stderr, "Error: Base64 serialization failed.\n");
+		/* on failure the buffer is not terminated; emit nothing from it */
+		base64_buf[0] = '\0';
+	}
+	fputs(base64_buf, outfile);
+	g_free(base64_buf);
+	fputs("\", \"$type\": \"00\"}", outfile);
+}
+
+/*
+ * Write one "name":value member to outfile.
+ *
+ * The column name is always written as a quoted string. The value is written
+ * as a {"$binary": ...} object for binary column types, as a quoted string
+ * for the remaining quote types, and verbatim for every other type.
+ * bin_len is forwarded as the byte count of col_val for the first two cases.
+ */
+static void
+print_col(FILE *outfile, char* col_name, gchar *col_val, int col_type, int bin_len) {
+	print_quoted_value(outfile, col_name, -1);
+	fputs(separator_char, outfile);
+
+	/* every binary type is also a quote type, so test the narrower set first */
+	if (is_binary_type(col_type)) {
+		print_binary_value(outfile, col_val, bin_len);
+	} else if (is_quote_type(col_type)) {
+		print_quoted_value(outfile, col_val, bin_len);
+	} else {
+		fputs(col_val, outfile);
+	}
+}
+/*
+ * mdb-json <file> <table>
+ *
+ * Opens the database <file>, reads <table>, and writes each fetched row to
+ * stdout as one {...} object followed by a newline. Columns whose bound
+ * length is zero for a row are left out of that row's object.
+ *
+ * Options:
+ *   -D, --date-format     passed to mdb_set_date_fmt()
+ *   -U, --no-unprintable  write bytes below 0x20 as spaces instead of \u00XX
+ *
+ * Returns 0 on success; exits with status 1 on bad options, a wrong argument
+ * count, a database that cannot be opened, or an unknown table.
+ */
+int
+main(int argc, char **argv)
+{
+	unsigned int i;
+	MdbHandle *mdb;
+	MdbTableDef *table;
+	MdbColumn *col;
+	char **bound_values;
+	int  *bound_lens;
+	FILE *outfile = stdout;
+	char *date_fmt = NULL;
+
+	GOptionEntry entries[] = {
+		{"date-format", 'D', 0, G_OPTION_ARG_STRING, &date_fmt, "Set the date format (see strftime(3) for details)", "format"},
+		{"no-unprintable", 'U', 0, G_OPTION_ARG_NONE, &drop_nonascii, "Change unprintable characters to spaces (otherwise escaped as \\u00XX)", NULL},
+		{NULL}
+	};
+
+	GError *error = NULL;
+	GOptionContext *opt_context;
+
+	opt_context = g_option_context_new("<file> <table> - export data from Access file to JSON");
+	g_option_context_add_main_entries(opt_context, entries, NULL /*i18n*/);
+	if (!g_option_context_parse (opt_context, &argc, &argv, &error))
+	{
+		fprintf(stderr, "option parsing failed: %s\n", error->message);
+		fputs(g_option_context_get_help(opt_context, TRUE, NULL), stderr);
+		exit (1);
+	}
+
+	if (argc != 3) {
+		fputs("Wrong number of arguments.\n\n", stderr);
+		fputs(g_option_context_get_help(opt_context, TRUE, NULL), stderr);
+		exit(1);
+	}
+
+	if (!(mdb = mdb_open(argv[1], MDB_NOFLAGS))) {
+		exit(1);
+	}
+
+	if (date_fmt)
+		mdb_set_date_fmt(mdb, date_fmt);
+
+	table = mdb_read_table_by_name(mdb, argv[2], MDB_TABLE);
+	if (!table) {
+		/* report the same argument that was used for the lookup */
+		fprintf(stderr, "Error: Table %s does not exist in this database.\n", argv[2]);
+		mdb_close(mdb);
+		exit(1);
+	}
+
+	/* read table */
+	mdb_read_columns(table);
+	mdb_rewind_table(table);
+
+	/* one value buffer and one length slot per column */
+	bound_values = (char **) g_malloc(table->num_cols * sizeof(char *));
+	bound_lens = (int *) g_malloc(table->num_cols * sizeof(int));
+	for (i=0;i<table->num_cols;i++) {
+		/* bind columns (column numbers passed to the binder start at 1) */
+		bound_values[i] = (char *) g_malloc0(MDB_BIND_SIZE);
+		mdb_bind_column(table, i+1, bound_values[i], &bound_lens[i]);
+	}
+
+	while (mdb_fetch_row(table)) {
+		int fields_written = 0;
+
+		fputs(row_start, outfile);
+		for (i=0;i<table->num_cols;i++) {
+			char *value;
+			size_t length = 0;
+			int is_ole;
+
+			/* a zero bound length means there is nothing to print */
+			if (!bound_lens[i])
+				continue;
+
+			col = g_ptr_array_index(table->columns, i);
+			is_ole = (col->col_type == MDB_OLE);
+			if (is_ole) {
+				/* OLE data is fetched in full into a buffer released below */
+				value = mdb_ole_read_full(mdb, col, &length);
+				if (!value) {
+					/* defensive: never hand a NULL buffer to the printers */
+					fprintf(stderr, "Warning: could not read OLE column %s; skipping it.\n", col->name);
+					continue;
+				}
+			} else {
+				value = bound_values[i];
+				length = bound_lens[i];
+			}
+
+			/* the delimiter goes between members, never before the first */
+			if (fields_written)
+				fputs(delimiter, outfile);
+			print_col(outfile, col->name, value, col->col_type, (int)length);
+			fields_written = 1;
+
+			if (is_ole)
+				free(value);
+		}
+		fputs(row_end, outfile);
+	}
+
+	/* free the memory used to bind */
+	for (i=0;i<table->num_cols;i++) {
+		g_free(bound_values[i]);
+	}
+	g_free(bound_values);
+	g_free(bound_lens);
+	mdb_free_tabledef(table);
+
+	mdb_close(mdb);
+	return 0;
+}