commit dd84304530b1adaef5dc435db4f796e096ee33c2 Author: yosh Date: Sat Jan 27 19:59:21 2024 -0500 initial commit diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..68a49da --- /dev/null +++ b/LICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. 
+ +For more information, please refer to <https://unlicense.org> diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..549d3e2 --- /dev/null +++ b/Makefile @@ -0,0 +1,16 @@ +.POSIX: +PREFIX = /usr/local +CFLAGS = -O2 + +build: bson2json +bson2json: bson2json.c + +clean: + rm -f bson2json + +install: bson2json + mkdir -p $(DESTDIR)$(PREFIX)/bin + cp $< $(DESTDIR)$(PREFIX)/bin + +uninstall: + rm -f $(DESTDIR)$(PREFIX)/bin/bson2json diff --git a/README.md b/README.md new file mode 100644 index 0000000..24811f8 --- /dev/null +++ b/README.md @@ -0,0 +1,34 @@ +# bson2json +a dead-simple no-frills pipeline utility for converting mongodb's [BSON](https://bsonspec.org/) format to JSON. + +## building and installing +``` +make +make install +``` +by default installs in `/usr/local/bin/bson2json` + +## usage +`bson2json` does not take any arguments. it only reads stdin and outputs to stdout. as such, common usage is like so: +``` +bson2json < bson_file.bson > json_file.json + +# or, a more complicated example... +bsonurl=$(curl https://some.server/api.php | jq -r '.link.filter') +curl "$bsonurl" | bson2json | jq -r '.filter.to.a.specific.value' +``` +`bson2json` has *very minimal* error checking. it assumes that the bson files you give it will be valid. it doesn't tell you what byte errors occur (that'd be weird). if an unrecoverable error occurs, the exit code will be nonzero. as such, if you're not 100% sure that the bson files you are giving it will be valid, perhaps have a setup like so: +``` +bson2json < bson_file.bson >/dev/null && bson2json < bson_file.bson | jq 'some_filter' +``` + +## fallbacks +- this diverges from the "official" `libbson` way of converting bson to json, because I think some of the official ways suck +- because json is much more stripped-down type wise than bson, some information is lost or converted when converting. 
/*
 * README.md, continued from the text above:
 *
 * notably:
 *   - binary data is converted to hex as a string, and doesn't specify what
 *     "type" of binary data it is
 *   - a lot of types that are put in a nested document with a key for their
 *     type are simply simplified to either remove the key or replace it with
 *     the json-specific type itself (e.g. double)
 *   - that's all I remember
 * - because proper c23 support for `_Decimal128` numbers (`0x13` for bson)
 *   isn't really all too there in c compilers as of writing this and I did
 *   not want to roll my own implementation of them, decimal128 numbers are
 *   represented as a binary string for the time being
 *
 * ## testing
 * I tested this on [the libbson test suite](https://github.com/mongodb/mongo-c-driver/tree/master/src/libbson/tests/binary)
 * and everything looked fine, only failing on the tests that are meant to
 * fail. all the output was valid json too, so we're good on that front as well
 *
 * Original patch header for this file, kept for reference:
 *
 * diff --git a/bson2json.c b/bson2json.c
 * new file mode 100644
 * index 0000000..fb2e066
 * --- /dev/null
 * +++ b/bson2json.c
 * @@ -0,0 +1,245 @@
 */

/*
 * bson2json.c - read one BSON document from stdin, write it as JSON to stdout.
 *
 * Every multi-byte field is decoded byte by byte, so the result does not
 * depend on the byte order or alignment rules of the host. Any truncated or
 * inconsistent input ends the program through die() with a nonzero exit code.
 */

#include <float.h>
#include <inttypes.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DATABUF_LEN 16
char DATA_BUFFER[DATABUF_LEN]; // scratch space holding one fixed-width field

// BSON doubles are 8 bytes; refuse to build where `double` is not.
typedef char double_must_be_64_bits[sizeof(double) == 8 ? 1 : -1];

void die(const char *format, ...);
void json_string_stdin(int32_t *doc_size);
void json_trash_string_stdin(int32_t *doc_size);
int32_t print_bson(uint8_t is_array);

/* prints a printf-style message to stderr and exits with status 1.
 * never returns.
 */
void
die(const char *format, ...)
{
    va_list args;

    va_start(args, format);
    vfprintf(stderr, format, args);
    va_end(args);

    exit(1);
}

/* reads exactly `len` bytes from stdin into `dst`, dies on a short read */
static void
read_exact(void *dst, size_t len)
{
    if (fread(dst, 1, len, stdin) != len) {
        die("bson2json: error: unexpected end of input. check your data.\n");
    }
}

/* reads one byte (0..255) from stdin, dies on end of input */
static int
read_byte(void)
{
    int c = getchar();

    if (c == EOF) {
        die("bson2json: error: unexpected end of input. check your data.\n");
    }
    return c;
}

/* takes `count` bytes off the remaining document size.
 * dies if the count is negative or larger than what is left, which also
 * keeps the size from ever going negative or overflowing.
 */
static void
consume(int32_t *doc_size, int32_t count)
{
    if (count < 0 || count > *doc_size) {
        die("bson2json: error: element runs past the end of its document. check your data.\n");
    }
    *doc_size -= count;
}

/* reads `len` (<= 8) bytes and returns them as a little-endian integer */
static uint64_t
read_le(size_t len)
{
    uint64_t value = 0;

    read_exact(DATA_BUFFER, len);
    while (len-- > 0) {
        value = (value << 8) | (unsigned char)DATA_BUFFER[len];
    }
    return value;
}

/* reads `len` (<= 8) bytes and returns them as a big-endian integer */
static uint64_t
read_be(size_t len)
{
    uint64_t value = 0;

    read_exact(DATA_BUFFER, len);
    for (size_t i = 0; i < len; i++) {
        value = (value << 8) | (unsigned char)DATA_BUFFER[i];
    }
    return value;
}

/* reads a little-endian signed 32 bit integer */
static int32_t
read_int32(void)
{
    uint32_t raw = (uint32_t)read_le(4);
    int32_t value;

    // memcpy reinterprets the bits without relying on signed conversion
    memcpy(&value, &raw, sizeof value);
    return value;
}

/* writes one string byte to stdout with the escaping json requires.
 * control bytes without a short escape are written as \u00XX so that no
 * data is dropped.
 */
static void
json_put_escaped(int c)
{
    switch (c) {
    case '"':
        fputs("\\\"", stdout);
        break;
    case '\\':
        fputs("\\\\", stdout);
        break;
    case '/':
        fputs("\\/", stdout);
        break;
    case '\b':
        fputs("\\b", stdout);
        break;
    case '\f':
        fputs("\\f", stdout);
        break;
    case '\n':
        fputs("\\n", stdout);
        break;
    case '\r':
        fputs("\\r", stdout);
        break;
    case '\t':
        fputs("\\t", stdout);
        break;
    default:
        if (c <= 0x1F) {
            printf("\\u%04x", (unsigned)c);
        } else {
            putchar(c);
        }
    }
}

/* this function reads a 0x00-terminated string (a bson cstring) from stdin
 * and prints it as a quoted, escaped json string.
 * every byte read, terminator included, is taken off *doc_size.
 * dies if the input ends before the terminator.
 */
void
json_string_stdin(int32_t *doc_size)
{
    int c;

    putchar('"');
    while ((c = read_byte()) != 0x00) {
        consume(doc_size, 1);
        json_put_escaped(c);
    }
    putchar('"');
    consume(doc_size, 1);
}

/* this function discards a 0x00-terminated string from stdin while taking
 * its bytes off *doc_size. this is for the index keys of array elements.
 * dies if the input ends before the terminator (instead of spinning forever).
 */
void
json_trash_string_stdin(int32_t *doc_size)
{
    while (read_byte() != 0x00) {
        consume(doc_size, 1);
    }
    consume(doc_size, 1);
}

/* reads a length-prefixed bson string (int32 length, bytes, 0x00) and prints
 * it as a json string. the length prefix is honoured, so a 0x00 byte inside
 * the string is printed as \u0000 rather than ending the string early.
 */
static void
json_counted_string(int32_t *doc_size)
{
    int32_t len;

    consume(doc_size, 4);
    len = read_int32();
    if (len < 1) {
        die("bson2json: error: invalid string length. check your data.\n");
    }
    consume(doc_size, len);

    putchar('"');
    for (int32_t i = 1; i < len; i++) {
        json_put_escaped(read_byte());
    }
    putchar('"');

    if (read_byte() != 0x00) {
        die("bson2json: error: string is missing its 0x00 terminator. check your data.\n");
    }
}

/* reads a 12 byte objectid and prints it as a json object.
 * the timestamp and the counter are stored big-endian. the 5 random bytes
 * have no numeric meaning and are read little-endian.
 */
static void
print_object_id(int32_t *doc_size)
{
    uint32_t timestamp;
    uint64_t random_part;
    uint32_t counter;

    consume(doc_size, 12);
    timestamp = (uint32_t)read_be(4);
    random_part = read_le(5);
    counter = (uint32_t)read_be(3);

    printf("{\"$ObjectId:timestamp\":%" PRIu32 ",", timestamp);
    printf("\"$ObjectId:rand\":%" PRIu64 ",", random_part);
    printf("\"$ObjectId:counter\":%" PRIu32 "}", counter);
}

/* reads an 8 byte double and prints it with enough digits to round-trip.
 * NaN and the infinities have no json spelling, so they become null.
 * assumes the host stores doubles as IEEE 754 binary64 in integer byte order.
 */
static void
print_double(void)
{
    uint64_t bits = read_le(8);
    double value;

    memcpy(&value, &bits, sizeof value);
    if (value != value || value > DBL_MAX || value < -DBL_MAX) {
        fputs("warning: NaN or infinite double in input. json cannot represent these, printing null\n", stderr);
        fputs("null", stdout);
        return;
    }
    printf("%.17g", value);
}

/* prints the value of one element whose type byte is `type`, taking the
 * bytes it reads off *doc_size. the key has already been handled.
 */
static void
print_element(int type, int32_t *doc_size)
{
    switch (type) {
    case 0x01: // double
        consume(doc_size, 8);
        print_double();
        break;
    case 0x02: case 0x0D: case 0x0E: // string, javascript code, symbol
        json_counted_string(doc_size);
        break;
    case 0x03: // document
        consume(doc_size, print_bson(0));
        break;
    case 0x04: // array
        consume(doc_size, print_bson(1));
        break;
    case 0x05: { // binary, printed as a hex string
        int32_t len;

        consume(doc_size, 5);
        len = read_int32();
        read_byte(); // the subtype is not represented in the output
        consume(doc_size, len);
        putchar('"');
        while (len-- > 0) {
            printf("%02x", (unsigned)read_byte());
        }
        putchar('"');
        break;
    }
    case 0x06: // undefined (deprecated)
        fputs("null", stdout);
        break;
    case 0x07: // objectid
        print_object_id(doc_size);
        break;
    case 0x08: // boolean
        consume(doc_size, 1);
        // technically only 0x01 is a valid "true", anything nonzero is accepted
        fputs(read_byte() == 0x00 ? "false" : "true", stdout);
        break;
    case 0x09: case 0x12: { // 64 bit ints (utc datetime and int64)
        uint64_t bits;
        int64_t value;

        consume(doc_size, 8);
        bits = read_le(8);
        memcpy(&value, &bits, sizeof value);
        printf("%" PRId64, value);
        break;
    }
    case 0x0A: // null
        fputs("null", stdout);
        break;
    case 0x0B: // regex
        fputs("{\"$regex:pattern\":", stdout);
        json_string_stdin(doc_size);
        fputs(",\"$regex:options\":", stdout);
        json_string_stdin(doc_size);
        putchar('}');
        break;
    case 0x0C: // dbpointer: a string followed by a full 12 byte objectid
        fputs("{\"$DBPointer:string\":", stdout);
        json_counted_string(doc_size);
        fputs(",\"$DBPointer:pointer\":", stdout);
        print_object_id(doc_size);
        putchar('}');
        break;
    case 0x0F: // code with scope
        consume(doc_size, 4);
        read_int32(); // combined size of code + scope, not needed
        fputs("{\"$code_w_s:code\":", stdout);
        json_counted_string(doc_size);
        fputs(",\"$code_w_s:scope\":", stdout);
        consume(doc_size, print_bson(0));
        putchar('}');
        break;
    case 0x10: // 32 bit int
        consume(doc_size, 4);
        printf("%" PRId32, read_int32());
        break;
    case 0x11: // timestamp (uint64)
        consume(doc_size, 8);
        printf("%" PRIu64, read_le(8));
        break;
    case 0x13: // 128-bit decimal
        fputs("warning: decimal128 number in input. these are a binary string until c23 support is better\n", stderr);
        consume(doc_size, 16);
        read_exact(DATA_BUFFER, 16);
        putchar('"');
        for (size_t i = 0; i < 16; i++) {
            printf("%02x", (unsigned)(unsigned char)DATA_BUFFER[i]);
        }
        putchar('"');
        break;
    case 0xFF: case 0x7F: // min/max key, nothing useful to show
        fputs("null", stdout);
        break;
    default:
        die("bson2json: error: unknown byte for item type. this is most likely the result of some other data bug.\n");
    }
}

/* prints the bson document at the current stdin position as json.
 * is_array: nonzero prints a json array and drops the element keys,
 *           zero prints a json object.
 * returns the complete size of the document in bytes, as declared by its
 * own length prefix. dies on truncated or inconsistent input.
 */
int32_t
print_bson(uint8_t is_array)
{
    int32_t total = read_int32();
    int32_t size;
    int type;

    // smallest possible document: 4 length bytes + the 0x00 terminator
    if (total < 5) {
        die("bson2json: error: invalid document size. check your data.\n");
    }
    size = total - 4;

    putchar(is_array ? '[' : '{');

    for (;;) {
        type = read_byte();
        consume(&size, 1);

        if (type == 0x00) {
            if (size != 0) {
                die("bson2json: error: 0x00 byte reached prematurely! check your data.\n");
            }
            putchar(is_array ? ']' : '}');
            return total;
        }

        if (is_array) {
            json_trash_string_stdin(&size);
        } else {
            json_string_stdin(&size);
            putchar(':');
        }

        print_element(type, &size);

        // more than the terminator byte left means another element follows
        if (size > 1) {
            putchar(',');
        }
    }
}

// define BSON2JSON_NO_MAIN to build the converter without its entry point
#ifndef BSON2JSON_NO_MAIN
int
main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    print_bson(0);
    putchar('\n');

    // a full disk or a closed pipe must not look like success
    if (fflush(stdout) == EOF || ferror(stdout)) {
        die("bson2json: error: failed to write output.\n");
    }

    return 0;
}
#endif