initial commit

This commit is contained in:
yosh 2024-01-27 19:59:21 -05:00
commit dd84304530
4 changed files with 319 additions and 0 deletions

24
LICENSE Normal file
View File

@ -0,0 +1,24 @@
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org/>

16
Makefile Normal file
View File

@ -0,0 +1,16 @@
# Build, install and uninstall rules for bson2json (POSIX make).
.POSIX:
# Installation root; override on the command line, e.g. `make PREFIX=/usr install`.
PREFIX = /usr/local
CFLAGS = -O2
# Default target: build the binary.
build: bson2json
# No recipe on purpose: make's built-in single-suffix `.c` inference rule
# compiles and links bson2json.c into bson2json.
bson2json: bson2json.c
clean:
	rm -f bson2json
# $(DESTDIR) is prepended to every install path; it is not set in this file.
# The binary is named explicitly because POSIX only defines `$<` inside
# inference rules; in a target rule like this one its value is unspecified.
install: bson2json
	mkdir -p $(DESTDIR)$(PREFIX)/bin
	cp bson2json $(DESTDIR)$(PREFIX)/bin
uninstall:
	rm -f $(DESTDIR)$(PREFIX)/bin/bson2json

34
README.md Normal file
View File

@ -0,0 +1,34 @@
# bson2json
a dead-simple no-frills pipeline utility for converting mongodb's [BSON](https://bsonspec.org/) format to JSON.
## building and installing
```
make
make install
```
by default installs in `/usr/local/bin/bson2json`
## usage
`bson2json` does not take any arguments. it only reads stdin and outputs to stdout. as such, common usage is like so:
```
bson2json < bson_file.bson > json_file.json
# or, a more complicated example...
bsonurl=$(curl https://some.server/api.php | jq -r '.link.filter')
curl "$bsonurl" | bson2json | jq -r '.filter.to.a.specific.value'
```
`bson2json` has *very minimal* error checking. it assumes that the bson files you give it will be valid, and it doesn't tell you at which byte an error occurred (that'd be weird). if an unrecoverable error occurs, the exit code will be nonzero. so if you're not 100% sure that the bson files you are giving it are valid, consider a setup like so:
```
bson2json < bson_file.bson >/dev/null && bson2json < bson_file.bson | jq 'some_filter'
```
## fallbacks
- this diverges from the "official" `libbson` way of converting bson to json, because I think some of the official ways suck
- because json is much more stripped-down type wise than bson, some information is lost or converted when converting. notably:
  - binary data is converted to a hex string, and the output doesn't specify what "type" of binary data it is
  - a lot of types that would be put in a nested document with a key for their type are simplified: the key is either removed or the value is replaced with the json-native type itself (e.g. double)
  - that's all I remember
- because proper c23 support for `_Decimal128` numbers (`0x13` for bson) isn't widely available in c compilers as of writing this, and I did not want to roll my own implementation of them, decimal128 numbers are represented as a hex string of their 16 raw bytes for the time being
## testing
I tested this on [the libbson test suite](https://github.com/mongodb/mongo-c-driver/tree/master/src/libbson/tests/binary) and everything looked fine, only failing on the tests that are meant to fail. all the output was valid json too, so we're good on that front as well

245
bson2json.c Normal file
View File

@ -0,0 +1,245 @@
#include <inttypes.h>
#include <math.h>
#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
// size, in bytes, of the shared scratch buffer below
#define DATABUF_LEN 16
// file-scope scratch buffer of DATABUF_LEN bytes for raw bytes read from stdin
char DATA_BUFFER[DATABUF_LEN]; // data buffer for funny casts
/* print a printf-style message to stderr and terminate the process
 * with exit status 1. never returns.
 *
 * format: printf-style format string; the variadic arguments must match it.
 *         const-qualified because callers pass string literals and the
 *         function never writes through it.
 */
void
die(const char *format, ...)
{
    va_list args;

    va_start(args, format);
    vfprintf(stderr, format, args);
    va_end(args);
    exit(1);
}
/* this function copies one 0x00-terminated string from stdin to stdout
 * as a quoted json string, performing the required json escaping.
 *
 * doc_size: remaining byte count of the enclosing document. it is
 *           decremented once per byte consumed, terminator included.
 *           if EOF is hit before a terminator it is still decremented
 *           once for the missing terminator.
 *
 * control characters below 0x20 that have no short escape are written
 * as \u00XX, so no input byte is dropped from the output.
 */
void
json_string_stdin(int32_t *doc_size)
{
    int c;

    putchar('"');
    for (c = getchar(); c != 0x00 && c != EOF; c = getchar()) {
        *doc_size -= 1;
        switch (c) {
        case '"':
            fputs("\\\"", stdout);
            break;
        case '\\':
            fputs("\\\\", stdout);
            break;
        case '/':
            fputs("\\/", stdout);
            break;
        case '\b':
            fputs("\\b", stdout);
            break;
        case '\f':
            fputs("\\f", stdout);
            break;
        case '\n':
            fputs("\\n", stdout);
            break;
        case '\r':
            fputs("\\r", stdout);
            break;
        case '\t':
            fputs("\\t", stdout);
            break;
        default:
            if (c <= 0x1F) {
                // json forbids raw control characters but allows \uXXXX
                printf("\\u%04x", (unsigned)c);
            } else {
                putchar(c);
            }
        }
    }
    putchar('"');
    *doc_size -= 1; // the terminator
}
/* this function discards one 0x00-terminated string from stdin while
 * removing its size from the document. this is for array keys in an array.
 *
 * doc_size: remaining byte count of the enclosing document. it is
 *           decremented once per byte consumed, terminator included.
 *
 * reading stops at EOF as well as at 0x00, so truncated input cannot
 * spin forever. as in json_string_stdin, the missing terminator is
 * still counted once in that case.
 */
void
json_trash_string_stdin(int32_t *doc_size)
{
    int c;

    do {
        c = getchar();
        *doc_size -= 1;
    } while (c != 0x00 && c != EOF);
}
/* read exactly len bytes from stdin into dst, or die on truncated input */
static void
bson_read_exact(unsigned char *dst, size_t len)
{
    if (fread(dst, 1, len, stdin) != len) {
        die("bson2json: error: unexpected end of input! check your data.\n");
    }
}

/* read one byte (0..255) from stdin, or die on truncated input */
static int
bson_read_byte(void)
{
    int c = getchar();

    if (c == EOF) {
        die("bson2json: error: unexpected end of input! check your data.\n");
    }
    return c;
}

/* decode len (at most 8) bytes as an unsigned little-endian integer */
static uint64_t
bson_decode_le(const unsigned char *src, size_t len)
{
    uint64_t value = 0;

    for (size_t i = len; i > 0; i--) {
        value = (value << 8) | src[i - 1];
    }
    return value;
}

/* decode len (at most 8) bytes as an unsigned big-endian integer */
static uint64_t
bson_decode_be(const unsigned char *src, size_t len)
{
    uint64_t value = 0;

    for (size_t i = 0; i < len; i++) {
        value = (value << 8) | src[i];
    }
    return value;
}

/* read a little-endian int32 from stdin, or die on truncated input */
static int32_t
bson_read_int32(void)
{
    unsigned char raw[4];
    uint32_t bits;
    int32_t value;

    bson_read_exact(raw, sizeof raw);
    bits = (uint32_t)bson_decode_le(raw, sizeof raw);
    memcpy(&value, &bits, sizeof value); // same width, so this is well defined
    return value;
}

/* print len bytes as lowercase hex, two digits per byte */
static void
bson_print_hex(const unsigned char *src, size_t len)
{
    for (size_t i = 0; i < len; i++) {
        printf("%02x", (unsigned)src[i]);
    }
}

/* read a length-prefixed bson string (int32 length, bytes, 0x00) from stdin
 * and print it as a json string, removing its size from *size.
 *
 * json_string_stdin stops at the first 0x00 byte, so the number of bytes it
 * consumed is compared with the declared length. a mismatch (malformed
 * length, or a string containing 0x00, which is not supported) is fatal
 * instead of silently desynchronising the parser.
 */
static void
bson_print_string(int32_t *size)
{
    int32_t declared = bson_read_int32();
    int32_t before;

    *size -= 4;
    before = *size;
    json_string_stdin(size);
    if (before - *size != declared) {
        die("bson2json: error: string length mismatch (embedded 0x00 bytes are not supported)! check your data.\n");
    }
}

// prints bson document as json, returns complete size of the document
/* reads one bson document from stdin and writes it to stdout as json.
 *
 * is_array: nonzero to print a json array (element keys are discarded),
 *           zero to print a json object.
 *
 * returns the document's declared size in bytes (the leading int32),
 * which callers subtract from their own remaining size.
 *
 * any truncated or inconsistent input ends the process through die().
 * fixed-width fields are decoded byte by byte, so the result does not
 * depend on host endianness or alignment. doubles are assumed to be
 * 64-bit ieee 754 on the host.
 */
int32_t
print_bson(uint8_t is_array)
{
    unsigned char raw[16]; // scratch space for fixed-width fields
    int32_t size_ret = bson_read_int32();
    int32_t size;
    int32_t len;
    uint64_t bits;
    int64_t i64;
    double number;
    int c;

    // the smallest document is the int32 size plus the 0x00 terminator
    if (size_ret < 5) {
        die("bson2json: error: invalid document size! check your data.\n");
    }
    size = size_ret - 4;
    putchar(is_array ? '[' : '{');
    while (1) {
        c = bson_read_byte();
        size -= 1;
        if (c == 0x00) {
            if (size == 0) {
                putchar(is_array ? ']' : '}');
                return size_ret;
            }
            die("bson2json: error: 0x00 byte reached prematurely! check your data.\n");
        }
        if (is_array) {
            json_trash_string_stdin(&size);
        } else {
            json_string_stdin(&size);
            putchar(':');
        }
        switch (c) {
        case 0x01: // double
            bson_read_exact(raw, 8);
            bits = bson_decode_le(raw, 8);
            memcpy(&number, &bits, sizeof number);
            if (isfinite(number)) {
                // 17 significant digits round-trip every double
                printf("%.17g", number);
            } else {
                // json has no spelling for NaN or infinity
                fputs("warning: non-finite double in input, printing null\n", stderr);
                fputs("null", stdout);
            }
            size -= 8;
            break;
        case 0x02: case 0x0D: case 0x0E: // string, javascript code, symbol
            bson_print_string(&size);
            break;
        case 0x03: // document
            size -= print_bson(0);
            break;
        case 0x04: // array
            size -= print_bson(1);
            break;
        case 0x05: // binary
            len = bson_read_int32();
            (void)bson_read_byte(); // subtype, not represented in the output
            size -= 5;
            if (len < 0 || len > size) {
                die("bson2json: error: invalid binary length! check your data.\n");
            }
            size -= len;
            putchar('"');
            for (; len > 0; len--) {
                printf("%02x", (unsigned)bson_read_byte());
            }
            putchar('"');
            break;
        case 0x06: // undefined (deprecated)
            fputs("null", stdout);
            break;
        case 0x07: // objectid
            bson_read_exact(raw, 12);
            // timestamp and counter are stored big-endian inside an objectid
            printf("{\"$ObjectId:timestamp\":%" PRIu32 ",", (uint32_t)bson_decode_be(raw, 4));
            // the 5 random bytes are opaque; read low byte first
            printf("\"$ObjectId:rand\":%" PRIu64 ",", bson_decode_le(raw + 4, 5));
            printf("\"$ObjectId:counter\":%" PRIu32 "}", (uint32_t)bson_decode_be(raw + 9, 3));
            size -= 12;
            break;
        case 0x08: // boolean
            // technically it should only be true if it == 0x01
            // and invalid otherwise but idc
            fputs(bson_read_byte() == 0x00 ? "false" : "true", stdout);
            size -= 1;
            break;
        case 0x09: case 0x12: // 64 bit ints (utc datetime and int64)
            bson_read_exact(raw, 8);
            bits = bson_decode_le(raw, 8);
            memcpy(&i64, &bits, sizeof i64);
            printf("%" PRId64, i64);
            size -= 8;
            break;
        case 0x0A: // null
            fputs("null", stdout);
            break;
        case 0x0B: // regex
            fputs("{\"$regex:pattern\":", stdout);
            json_string_stdin(&size);
            fputs(",\"$regex:options\":", stdout);
            json_string_stdin(&size);
            putchar('}');
            break;
        case 0x0C: // dbpointer
            fputs("{\"$DBPointer:string\":", stdout);
            bson_print_string(&size);
            bson_read_exact(raw, 12);
            // the pointer is 12 bytes, too wide for a number: print it as hex
            fputs(",\"$DBPointer:pointer\":\"", stdout);
            bson_print_hex(raw, 12);
            fputs("\"}", stdout);
            size -= 12;
            break;
        case 0x0F: // code with scope
            (void)bson_read_int32(); // size of the entire element, unused
            size -= 4;
            fputs("{\"$code_w_s:code\":", stdout);
            bson_print_string(&size);
            fputs(",\"$code_w_s:scope\":", stdout);
            size -= print_bson(0);
            putchar('}');
            break;
        case 0x10: // 32 bit int
            printf("%" PRId32, bson_read_int32());
            size -= 4;
            break;
        case 0x11: // timestamp (uint64)
            bson_read_exact(raw, 8);
            printf("%" PRIu64, bson_decode_le(raw, 8));
            size -= 8;
            break;
        case 0x13: // 128-bit decimal
            fputs("warning: decimal128 number in input. these are a binary string until c23 support is better\n", stderr);
            bson_read_exact(raw, 16);
            putchar('"');
            bson_print_hex(raw, 16);
            putchar('"');
            size -= 16;
            break;
        case 0xFF: case 0x7F: // min/max type idc about these
            fputs("null", stdout);
            break;
        default:
            die("bson2json: error: unknown byte for item type. this is most likely the result of some other data bug.\n");
        }
        // at least the document's 0x00 terminator must still be left
        if (size < 1) {
            die("bson2json: error: element overruns its document! check your data.\n");
        }
        if (size > 1) {
            putchar(',');
        }
    }
}
/* program entry point: converts one bson document from stdin to json on
 * stdout, followed by a newline. command-line arguments are ignored.
 *
 * returns 0 on success and 1 when the output could not be written, so a
 * full disk or closed pipe is not reported as success.
 */
int
main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    print_bson(0);
    putchar('\n');
    // stdout is buffered: flush now so write errors are seen before exiting
    if (fflush(stdout) != 0 || ferror(stdout)) {
        fputs("bson2json: error: failed to write output\n", stderr);
        return 1;
    }
    return 0;
}