diff options
author | Charlie Stanton <charlie@shtanton.xyz> | 2021-10-04 14:11:29 +0100 |
---|---|---|
committer | Charlie Stanton <charlie@shtanton.xyz> | 2021-10-04 14:11:29 +0100 |
commit | 14e929a7596baa7ee9da401975c521aa0e93c3b4 (patch) | |
tree | 90459c8dd3b5756d2da409e8dbfdd19420e7e882 | |
parent | d973b4b8eaaa5e7f1d10f9f02069c6d7bde25dcc (diff) | |
download | cudl-14e929a7596baa7ee9da401975c521aa0e93c3b4.tar |
Add string and map parsing. Currently no support for the schema variants of the syntax for these types
-rw-r--r-- | cudl.c | 346 | ||||
-rw-r--r-- | cudl.h | 10 | ||||
-rw-r--r-- | spec.txt | 6 | ||||
-rw-r--r-- | test.cudl | 7 |
4 files changed, 363 insertions, 6 deletions
@@ -4,7 +4,13 @@ #include <string.h> #include "cudl.h" -#define STRIP_WHITESPACE(text) while (isspace(*text)) text++ +#define STRIP_WHITESPACE(text) while (isspace(*(text))) (text)++ +#define IS_KEY_CHAR(c) (\ + 'a' <= (c) && (c) <= 'z' ||\ + 'A' <= (c) && (c) <= 'Z' ||\ + '0' <= (c) && (c) <= '9' ||\ + (c) == '_' || (c) == '-'\ +) int cudl_err = CUDL_OK; @@ -37,12 +43,23 @@ void cudl_debug(struct cudl_value value) { else printf("%%false"); break; + case CUDL_TAG_STRING: + printf("\"%s\"", value.data.string); + break;; case CUDL_TAG_ARRAY: printf("["); for (i = 0; i < value.data.array.length; i++) cudl_debug(value.data.array.values[i]); printf("]"); break; + case CUDL_TAG_MAP: + printf("{"); + for (i = 0; i < value.data.map.length; i++) { + printf("\"%s\": ", value.data.map.fields[i].key); + cudl_debug(value.data.map.fields[i].value); + } + printf("}"); + break; default: printf("UNKNOWN"); break; @@ -89,6 +106,210 @@ static size_t parse_bool_or_null(char *input, struct cudl_value *value) { return 0; } +/* Convert UCS character to utf-8 bytes. + * Return number of bytes generated. + * Sets cudl_error on error. 
+ * Shamelessly lifted from https://github.com/cktan/tomc99 */ +size_t cudl_ucs_to_utf8(int64_t ucs, char utf8[6]) { + if ( + 0xd800 <= ucs && ucs <= 0xdfff || + 0xfffe <= ucs && ucs <= 0xffff || + ucs < 0 + ) { + cudl_err = CUDL_ERR_UNRECOGNISED_UNICODE; + return 0; + } + + /* 0x00000000 - 0x0000007F: + 0xxxxxxx + */ + if (ucs <= 0x7F) { + utf8[0] = (unsigned char) ucs; + return 1; + } + + /* 0x00000080 - 0x000007FF: + 110xxxxx 10xxxxxx + */ + if (ucs <= 0x000007FF) { + utf8[0] = 0xc0 | (ucs >> 6); + utf8[1] = 0x80 | (ucs & 0x3f); + return 2; + } + + /* 0x00000800 - 0x0000FFFF: + 1110xxxx 10xxxxxx 10xxxxxx + */ + if (ucs <= 0x0000FFFF) { + utf8[0] = 0xe0 | (ucs >> 12); + utf8[1] = 0x80 | ((ucs >> 6) & 0x3f); + utf8[2] = 0x80 | (ucs & 0x3f); + return 3; + } + + /* 0x00010000 - 0x001FFFFF: + 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + */ + if (ucs <= 0x001FFFFF) { + utf8[0] = 0xf0 | (ucs >> 18); + utf8[1] = 0x80 | ((ucs >> 12) & 0x3f); + utf8[2] = 0x80 | ((ucs >> 6) & 0x3f); + utf8[3] = 0x80 | (ucs & 0x3f); + return 4; + } + + /* 0x00200000 - 0x03FFFFFF: + 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + */ + if (ucs <= 0x03FFFFFF) { + utf8[0] = 0xf8 | (ucs >> 24); + utf8[1] = 0x80 | ((ucs >> 18) & 0x3f); + utf8[2] = 0x80 | ((ucs >> 12) & 0x3f); + utf8[3] = 0x80 | ((ucs >> 6) & 0x3f); + utf8[4] = 0x80 | (ucs & 0x3f); + return 5; + } + + /* 0x04000000 - 0x7FFFFFFF: + 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + */ + if (ucs <= 0x7FFFFFFF) { + utf8[0] = 0xfc | (ucs >> 30); + utf8[1] = 0x80 | ((ucs >> 24) & 0x3f); + utf8[2] = 0x80 | ((ucs >> 18) & 0x3f); + utf8[3] = 0x80 | ((ucs >> 12) & 0x3f); + utf8[4] = 0x80 | ((ucs >> 6) & 0x3f); + utf8[5] = 0x80 | (ucs & 0x3f); + return 6; + } + + cudl_err = CUDL_ERR_UNRECOGNISED_UNICODE; + return 0; +} + +/* Parse a string starting after the opening quote. + * Set string to be the contents of the string. + * No memory is allocated if an error occurs. 
*/ +static size_t parse_quoted_string(char *input, char **string) { + size_t length, capacity; + char *original_input, *newstring; + int64_t ucs; + int ucs_length, i; + + length = 0; + capacity = 32; + original_input = input; + if ((*string = malloc(capacity)) == NULL) { + cudl_err = CUDL_ERR_OUT_OF_MEMORY; + return 0; + } + for (;;) { + if (*input == '\0') { + cudl_err = CUDL_ERR_UNMATCHED_QUOTE; + free(*string); + return 0; + } + if (*input == '"') { + if ((newstring = realloc(*string, length + 1)) == NULL) { + cudl_err = CUDL_ERR_OUT_OF_MEMORY; + free(*string); + return 0; + } + *string = newstring; + (*string)[length] = '\0'; + input++; + return input - original_input; + } + if (length >= capacity) { + if ((newstring = realloc(*string, capacity * 2)) == NULL) { + cudl_err = CUDL_ERR_OUT_OF_MEMORY; + free(*string); + return 0; + } + *string = newstring; + capacity *= 2; + } + if (*input == '\\') { + input++; + switch (*input) { + case '\0': + cudl_err = CUDL_ERR_EXPECTED_ESCAPE_SEQUENCE; + free(*string); + return 0; + case 'b': + (*string)[length++] = '\b'; + input++; + break; + case 't': + (*string)[length++] = '\t'; + input++; + break; + case 'n': + (*string)[length++] = '\n'; + input++; + break; + case 'r': + (*string)[length++] = '\r'; + input++; + break; + case '"': + (*string)[length++] = '"'; + input++; + break; + case '\\': + (*string)[length++] = '\\'; + input++; + break; + case 'u': + case 'U': + ucs = 0; + ucs_length = (*input == 'u') ? 
4 : 8; + input++; + for (i = 0; i < ucs_length; i++) { + if (input[i] == '\0') { + cudl_err = CUDL_ERR_EXPECTED_ESCAPE_SEQUENCE; + free(*string); + return 0; + } + if ('0' <= input[i] && input[i] <= '9') { + ucs = (ucs << 4) + (input[i] - '0'); + } else if ('a' <= input[i] && input[i] <= 'z') { + ucs = (ucs << 4) + (input[i] - 'a' + 10); + } else if ('A' <= input[i] && input[i] <= 'Z') { + ucs = (ucs << 4) + (input[i] - 'A' + 10); + } else { + cudl_err = CUDL_ERR_EXPECTED_ESCAPE_SEQUENCE; + free(*string); + return 0; + } + } + if (length + 6 > capacity) { + if ((newstring = realloc(*string, capacity * 2)) == NULL) { + cudl_err = CUDL_ERR_OUT_OF_MEMORY; + free(*string); + return 0; + } + *string = newstring; + capacity *= 2; + } + length += cudl_ucs_to_utf8(ucs, (*string) + length); + if (cudl_err) { + free(*string); + return 0; + } + input += ucs_length; + break; + default: + (*string)[length++] = *input; + input++; + break; + } + } else { + (*string)[length++] = *(input++); + } + } +} + static size_t parse_array(char *input, struct cudl_value *value) { size_t length, capacity; struct cudl_value *values, *newvalues; @@ -127,13 +348,13 @@ static size_t parse_array(char *input, struct cudl_value *value) { free(values); return 0; } + values = newvalues; capacity *= 2; } input += parse_value(input, values + length); if (cudl_err) { - for (i = 0; i < length; i++) { + for (i = 0; i < length; i++) cudl_deinit_value(values[i]); - } free(values); return 0; } @@ -141,11 +362,130 @@ static size_t parse_array(char *input, struct cudl_value *value) { } } +static size_t parse_map_key(char *input, char **key) { + char *original_input; + switch (*input) { + case '\0': + cudl_err = CUDL_ERR_EXPECTED_MAP_KEY; + return 0; + case '"': + input++; + return parse_quoted_string(input, key) + 1; + default: + original_input = input; + while (IS_KEY_CHAR(*input)) + input++; + if (input == original_input) { + cudl_err = CUDL_ERR_EXPECTED_MAP_KEY; + return 0; + } + if ((*key = malloc(input - 
original_input + 1)) == NULL) { + cudl_err = CUDL_ERR_OUT_OF_MEMORY; + return 0; + } + memcpy(*key, original_input, input - original_input); + (*key)[input - original_input] = '\0'; + return input - original_input; + } +} + +static size_t parse_map(char *input, struct cudl_value *value, char end_char) { + printf("Parsing a map from: %s\n", input); + char *original_input; + int i; + struct cudl_map_field *fields, *newfields; + size_t length, capacity; + + original_input = input; + value->tag = CUDL_TAG_MAP; + length = 0; + capacity = 8; + if ((fields = malloc(capacity * sizeof(struct cudl_map_field))) == NULL) { + cudl_err = CUDL_ERR_OUT_OF_MEMORY; + return 0; + } + + STRIP_WHITESPACE(input); + for (;;) { + printf("Parsing a field from: %s\n", input); + if (*input == end_char) { + input++; + fields = realloc(fields, length * sizeof(struct cudl_map_field)); + value->data.map.length = length; + value->data.map.fields = fields; + return input - original_input; + } + if (*input == '\0') { + cudl_err = CUDL_ERR_UNMATCHED_BRACE; + for (i = 0; i < length; i++) { + cudl_deinit_value(fields[i].value); + free(fields[i].key); + } + free(fields); + return 0; + } + if (length >= capacity) { + if ((newfields = realloc(fields, 2 * capacity * sizeof(struct cudl_map_field))) == NULL) { + cudl_err = CUDL_ERR_OUT_OF_MEMORY; + for (i = 0; i < length; i++) { + cudl_deinit_value(fields[i].value); + free(fields[i].key); + } + free(fields); + return 0; + } + fields = newfields; + capacity *= 2; + } + printf("Parsing a key from: %s\n", input); + input += parse_map_key(input, &fields[length].key); + if (cudl_err) { + for (i = 0; i < length; i++) { + cudl_deinit_value(fields[i].value); + free(fields[i].key); + } + free(fields); + return 0; + } + STRIP_WHITESPACE(input); + if (*input != ':') { + cudl_err = CUDL_ERR_EXPECTED_COLON; + for (i = 0; i < length; i++) { + cudl_deinit_value(fields[i].value); + free(fields[i].key); + } + free(fields[length].key); + free(fields); + return 0; + } + 
input++; + STRIP_WHITESPACE(input); + printf("Parsing a field value from: %s\n", input); + input += parse_value(input, &fields[length].value); + if (cudl_err) { + for (i = 0; i < length; i++) { + cudl_deinit_value(fields[i].value); + free(fields[i].key); + } + free(fields[length].key); + free(fields); + return 0; + } + length++; + } +} + static size_t _parse_value(char *input, struct cudl_value *value) { if (*input == '%') return parse_bool_or_null(++input, value) + 1; if (*input == '[') return parse_array(++input, value) + 1; + if (*input == '{') + return parse_map(++input, value, '}') + 1; + if (*input == '"') { + value->tag = CUDL_TAG_STRING; + return parse_quoted_string(++input, &value->data.string) + 1; + } cudl_err = CUDL_ERR_UNRECOGNISED_VALUE; return 0; } @@ -31,16 +31,24 @@ enum { CUDL_TAG_NULL, CUDL_TAG_BOOL, CUDL_TAG_ARRAY, + CUDL_TAG_MAP, + CUDL_TAG_STRING, }; enum { CUDL_OK = 0, CUDL_ERR_OUT_OF_MEMORY, - CUDL_ERR_EXPECTED_VALUE, CUDL_ERR_READING, + CUDL_ERR_EXPECTED_VALUE, CUDL_ERR_EXPECTED_BOOL_OR_NULL, + CUDL_ERR_EXPECTED_ESCAPE_SEQUENCE, + CUDL_ERR_EXPECTED_MAP_KEY, + CUDL_ERR_EXPECTED_COLON, CUDL_ERR_UNMATCHED_BRACK, + CUDL_ERR_UNMATCHED_BRACE, + CUDL_ERR_UNMATCHED_QUOTE, CUDL_ERR_UNRECOGNISED_VALUE, + CUDL_ERR_UNRECOGNISED_UNICODE, }; extern int cudl_err; @@ -1,4 +1,4 @@ -# Spec attempt 5 +# CUDL - Clear and Unmistakable Data Language * Every file contains 1 value, which may have other values nested inside it. * A schema can be provided when a file is parsed which gives it's value a type. @@ -17,7 +17,8 @@ An inline-end character is one of the following: ## Map A sequence of key:value pairs. No delimeter is needed as every value will have a ending marker. -If a key starts with a quote then it continues until another quote ends it. Quotes can be escaped by using 2 of them. +The first key may be preceeded by whitespace and whitespace can occur before or after the : between the key and value. 
+If a key starts with a quote then it obeys the same rules as a quoted string. Otherwise a key must match [A-Za-z0-9_-]+ A map can be preceded by a { and succeeded by a } @@ -45,6 +46,7 @@ The following escape sequences are available for quoted strings and keys: \" - quote \\ - backslash \uXXXX - unicode XXXX +\UXXXXXXXX - unicode XXXXXXXX ``` ### Multiline string @@ -2,4 +2,11 @@ %true %false [%null %null %false] + ["hello\nfriend\t\tstuff\"" "world"] + "\U0001f600" + { + testing: "this is a test map" + i_hope_it_works: {nesting: "nested maps!!!"} + "here's an interesting key :D": %null + } ] |