<- Back to shtanton's homepage
aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCharlie Stanton <charlie@shtanton.xyz>2021-10-04 14:11:29 +0100
committerCharlie Stanton <charlie@shtanton.xyz>2021-10-04 14:11:29 +0100
commit14e929a7596baa7ee9da401975c521aa0e93c3b4 (patch)
tree90459c8dd3b5756d2da409e8dbfdd19420e7e882
parentd973b4b8eaaa5e7f1d10f9f02069c6d7bde25dcc (diff)
downloadcudl-14e929a7596baa7ee9da401975c521aa0e93c3b4.tar
Add string and map parsing. Currently no support for the schema variants of the syntax for these types
-rw-r--r--cudl.c346
-rw-r--r--cudl.h10
-rw-r--r--spec.txt6
-rw-r--r--test.cudl7
4 files changed, 363 insertions, 6 deletions
diff --git a/cudl.c b/cudl.c
index fc75bb6..57a57ec 100644
--- a/cudl.c
+++ b/cudl.c
@@ -4,7 +4,13 @@
#include <string.h>
#include "cudl.h"
-#define STRIP_WHITESPACE(text) while (isspace(*text)) text++
+/* Advance the pointer lvalue `text` past any leading whitespace.
+ * The character is converted to unsigned char first because passing a
+ * negative value (other than EOF) to isspace is undefined behaviour. */
+#define STRIP_WHITESPACE(text) while (isspace((unsigned char) *(text))) (text)++
+/* Non-zero when c may appear in an unquoted map key: [A-Za-z0-9_-].
+ * Evaluates c more than once, so do not pass an expression with side effects. */
+#define IS_KEY_CHAR(c) (\
+    ('a' <= (c) && (c) <= 'z') ||\
+    ('A' <= (c) && (c) <= 'Z') ||\
+    ('0' <= (c) && (c) <= '9') ||\
+    (c) == '_' || (c) == '-'\
+)
int cudl_err = CUDL_OK;
@@ -37,12 +43,23 @@ void cudl_debug(struct cudl_value value) {
else
printf("%%false");
break;
+ case CUDL_TAG_STRING:
+ printf("\"%s\"", value.data.string);
+ break;;
case CUDL_TAG_ARRAY:
printf("[");
for (i = 0; i < value.data.array.length; i++)
cudl_debug(value.data.array.values[i]);
printf("]");
break;
+ case CUDL_TAG_MAP:
+ printf("{");
+ for (i = 0; i < value.data.map.length; i++) {
+ printf("\"%s\": ", value.data.map.fields[i].key);
+ cudl_debug(value.data.map.fields[i].value);
+ }
+ printf("}");
+ break;
default:
printf("UNKNOWN");
break;
@@ -89,6 +106,210 @@ static size_t parse_bool_or_null(char *input, struct cudl_value *value) {
return 0;
}
+/* Encode a Unicode code point as UTF-8.
+ *
+ * ucs:  code point to encode.
+ * utf8: output buffer; at most 4 bytes are written and no NUL terminator
+ *       is appended.
+ *
+ * Returns the number of bytes written. On failure returns 0 and sets
+ * cudl_err to CUDL_ERR_UNRECOGNISED_UNICODE. Rejected inputs are negative
+ * values, the surrogate range U+D800..U+DFFF, the noncharacters U+FFFE and
+ * U+FFFF, and anything above U+10FFFF, which UTF-8 cannot represent.
+ *
+ * Adapted from https://github.com/cktan/tomlc99 */
+size_t cudl_ucs_to_utf8(int64_t ucs, char utf8[6]) {
+    if (
+        ucs < 0 ||
+        (0xd800 <= ucs && ucs <= 0xdfff) ||
+        (0xfffe <= ucs && ucs <= 0xffff) ||
+        ucs > 0x10ffff
+    ) {
+        cudl_err = CUDL_ERR_UNRECOGNISED_UNICODE;
+        return 0;
+    }
+
+    /* U+0000 - U+007F:
+       0xxxxxxx
+    */
+    if (ucs <= 0x7f) {
+        utf8[0] = (char) ucs;
+        return 1;
+    }
+
+    /* U+0080 - U+07FF:
+       110xxxxx 10xxxxxx
+    */
+    if (ucs <= 0x7ff) {
+        utf8[0] = (char) (0xc0 | (ucs >> 6));
+        utf8[1] = (char) (0x80 | (ucs & 0x3f));
+        return 2;
+    }
+
+    /* U+0800 - U+FFFF:
+       1110xxxx 10xxxxxx 10xxxxxx
+    */
+    if (ucs <= 0xffff) {
+        utf8[0] = (char) (0xe0 | (ucs >> 12));
+        utf8[1] = (char) (0x80 | ((ucs >> 6) & 0x3f));
+        utf8[2] = (char) (0x80 | (ucs & 0x3f));
+        return 3;
+    }
+
+    /* U+10000 - U+10FFFF (the range check above excludes anything larger):
+       11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    */
+    utf8[0] = (char) (0xf0 | (ucs >> 18));
+    utf8[1] = (char) (0x80 | ((ucs >> 12) & 0x3f));
+    utf8[2] = (char) (0x80 | ((ucs >> 6) & 0x3f));
+    utf8[3] = (char) (0x80 | (ucs & 0x3f));
+    return 4;
+}
+
+/* Parse the body of a quoted string, starting just after the opening quote.
+ *
+ * input:  text positioned on the first character after the opening '"'.
+ * string: on success receives a malloc'd, NUL-terminated copy of the
+ *         contents with escape sequences decoded; the caller frees it.
+ *
+ * Returns the number of input characters consumed, including the closing
+ * quote. On failure sets cudl_err, frees the working buffer, sets *string
+ * to NULL and returns 0, so no memory is left allocated.
+ *
+ * Escapes: \b \t \n \r \" \\ plus \uXXXX and \UXXXXXXXX (hexadecimal code
+ * point, written out as UTF-8). A backslash before any other character
+ * yields that character unchanged. */
+static size_t parse_quoted_string(char *input, char **string) {
+    size_t length = 0, capacity = 32;
+    char *original_input = input, *newstring;
+    int64_t ucs;
+    int ucs_length, i, digit;
+    char c;
+
+    if ((*string = malloc(capacity)) == NULL) {
+        cudl_err = CUDL_ERR_OUT_OF_MEMORY;
+        return 0;
+    }
+    for (;;) {
+        if (*input == '\0') {
+            cudl_err = CUDL_ERR_UNMATCHED_QUOTE;
+            goto fail;
+        }
+        if (*input == '"') {
+            /* Trim the buffer to the exact size, including the terminator. */
+            if ((newstring = realloc(*string, length + 1)) == NULL) {
+                cudl_err = CUDL_ERR_OUT_OF_MEMORY;
+                goto fail;
+            }
+            newstring[length] = '\0';
+            *string = newstring;
+            return (size_t) (input + 1 - original_input);
+        }
+        /* Make sure at least one more byte fits before decoding anything. */
+        if (length >= capacity) {
+            if ((newstring = realloc(*string, capacity * 2)) == NULL) {
+                cudl_err = CUDL_ERR_OUT_OF_MEMORY;
+                goto fail;
+            }
+            *string = newstring;
+            capacity *= 2;
+        }
+        if (*input != '\\') {
+            (*string)[length++] = *input++;
+            continue;
+        }
+        input++;
+        switch (*input) {
+        case '\0':
+            cudl_err = CUDL_ERR_EXPECTED_ESCAPE_SEQUENCE;
+            goto fail;
+        case 'b':
+            (*string)[length++] = '\b';
+            input++;
+            break;
+        case 't':
+            (*string)[length++] = '\t';
+            input++;
+            break;
+        case 'n':
+            (*string)[length++] = '\n';
+            input++;
+            break;
+        case 'r':
+            (*string)[length++] = '\r';
+            input++;
+            break;
+        case 'u':
+        case 'U':
+            ucs_length = (*input == 'u') ? 4 : 8;
+            input++;
+            ucs = 0;
+            for (i = 0; i < ucs_length; i++) {
+                c = input[i];
+                if ('0' <= c && c <= '9') {
+                    digit = c - '0';
+                } else if ('a' <= c && c <= 'f') {
+                    digit = c - 'a' + 10;
+                } else if ('A' <= c && c <= 'F') {
+                    digit = c - 'A' + 10;
+                } else {
+                    /* Not a hex digit; this also catches input that ends
+                     * in the middle of the escape. */
+                    cudl_err = CUDL_ERR_EXPECTED_ESCAPE_SEQUENCE;
+                    goto fail;
+                }
+                ucs = (ucs << 4) | digit;
+            }
+            /* cudl_ucs_to_utf8 is declared with a 6-byte buffer, so keep
+             * that much room free. Capacity starts at 32, so one doubling
+             * is always enough. */
+            if (length + 6 > capacity) {
+                if ((newstring = realloc(*string, capacity * 2)) == NULL) {
+                    cudl_err = CUDL_ERR_OUT_OF_MEMORY;
+                    goto fail;
+                }
+                *string = newstring;
+                capacity *= 2;
+            }
+            length += cudl_ucs_to_utf8(ucs, *string + length);
+            if (cudl_err)
+                goto fail;
+            input += ucs_length;
+            break;
+        default:
+            /* Covers \" and \\ as well as any unrecognised escape: the
+             * escaped character is copied through as-is. */
+            (*string)[length++] = *input++;
+            break;
+        }
+    }
+
+fail:
+    free(*string);
+    *string = NULL;
+    return 0;
+}
+
static size_t parse_array(char *input, struct cudl_value *value) {
size_t length, capacity;
struct cudl_value *values, *newvalues;
@@ -127,13 +348,13 @@ static size_t parse_array(char *input, struct cudl_value *value) {
free(values);
return 0;
}
+ values = newvalues;
capacity *= 2;
}
input += parse_value(input, values + length);
if (cudl_err) {
- for (i = 0; i < length; i++) {
+ for (i = 0; i < length; i++)
cudl_deinit_value(values[i]);
- }
free(values);
return 0;
}
@@ -141,11 +362,130 @@ static size_t parse_array(char *input, struct cudl_value *value) {
}
}
+/* Parse one map key at input and store a newly allocated, NUL-terminated
+ * copy of it in *key.
+ *
+ * A key is either a quoted string (handed to parse_quoted_string) or a
+ * non-empty run of characters accepted by IS_KEY_CHAR.
+ *
+ * Returns the number of characters consumed. If no key is found, cudl_err
+ * is set to CUDL_ERR_EXPECTED_MAP_KEY and 0 is returned; if the copy cannot
+ * be allocated, cudl_err is set to CUDL_ERR_OUT_OF_MEMORY and 0 is
+ * returned. When a quoted key fails to parse, cudl_err is set by
+ * parse_quoted_string and the return value is not meaningful, so callers
+ * must check cudl_err. */
+static size_t parse_map_key(char *input, char **key) {
+    char *end;
+    size_t key_length;
+
+    /* Quoted key: skip the opening quote and count it in the result. */
+    if (*input == '"')
+        return parse_quoted_string(input + 1, key) + 1;
+
+    /* Bare key: measure the run of key characters. End of input yields an
+     * empty run and so reports a missing key. */
+    end = input;
+    while (IS_KEY_CHAR(*end))
+        end++;
+    key_length = end - input;
+    if (key_length == 0) {
+        cudl_err = CUDL_ERR_EXPECTED_MAP_KEY;
+        return 0;
+    }
+
+    *key = malloc(key_length + 1);
+    if (*key == NULL) {
+        cudl_err = CUDL_ERR_OUT_OF_MEMORY;
+        return 0;
+    }
+    memcpy(*key, input, key_length);
+    (*key)[key_length] = '\0';
+    return key_length;
+}
+
+/* Parse the fields of a map, starting just after the opening delimiter.
+ *
+ * input:    text positioned after the opening delimiter; leading whitespace
+ *           is skipped.
+ * value:    receives tag CUDL_TAG_MAP and, on success, the field array and
+ *           its length.
+ * end_char: character that terminates the map ('}' when called from
+ *           _parse_value).
+ *
+ * Each field is a key (see parse_map_key), optional whitespace, ':',
+ * optional whitespace, then a value parsed by parse_value.
+ *
+ * Returns the number of characters consumed, including end_char. On
+ * failure sets cudl_err, releases every key and value parsed so far and
+ * returns 0. */
+static size_t parse_map(char *input, struct cudl_value *value, char end_char) {
+    char *original_input = input;
+    struct cudl_map_field *fields, *newfields;
+    size_t length = 0, capacity = 8, i;
+
+    value->tag = CUDL_TAG_MAP;
+    if ((fields = malloc(capacity * sizeof *fields)) == NULL) {
+        cudl_err = CUDL_ERR_OUT_OF_MEMORY;
+        return 0;
+    }
+
+    STRIP_WHITESPACE(input);
+    for (;;) {
+        if (*input == end_char) {
+            input++;
+            /* Shrink to fit when possible. An empty map keeps its initial
+             * allocation (realloc with size 0 is not portable), and a
+             * failed shrink keeps the larger, still valid buffer. */
+            if (length > 0) {
+                newfields = realloc(fields, length * sizeof *fields);
+                if (newfields != NULL)
+                    fields = newfields;
+            }
+            value->data.map.length = length;
+            value->data.map.fields = fields;
+            return (size_t) (input - original_input);
+        }
+        if (*input == '\0') {
+            cudl_err = CUDL_ERR_UNMATCHED_BRACE;
+            goto fail;
+        }
+        if (length >= capacity) {
+            newfields = realloc(fields, 2 * capacity * sizeof *fields);
+            if (newfields == NULL) {
+                cudl_err = CUDL_ERR_OUT_OF_MEMORY;
+                goto fail;
+            }
+            fields = newfields;
+            capacity *= 2;
+        }
+        input += parse_map_key(input, &fields[length].key);
+        /* A failed key parse leaves nothing allocated for this field. */
+        if (cudl_err)
+            goto fail;
+        STRIP_WHITESPACE(input);
+        if (*input != ':') {
+            cudl_err = CUDL_ERR_EXPECTED_COLON;
+            goto fail_key;
+        }
+        input++;
+        STRIP_WHITESPACE(input);
+        input += parse_value(input, &fields[length].value);
+        if (cudl_err)
+            goto fail_key;
+        length++;
+    }
+
+fail_key:
+    /* The current field has a key but no completed value. */
+    free(fields[length].key);
+fail:
+    for (i = 0; i < length; i++) {
+        cudl_deinit_value(fields[i].value);
+        free(fields[i].key);
+    }
+    free(fields);
+    return 0;
+}
+
static size_t _parse_value(char *input, struct cudl_value *value) {
if (*input == '%')
return parse_bool_or_null(++input, value) + 1;
if (*input == '[')
return parse_array(++input, value) + 1;
+ if (*input == '{')
+ return parse_map(++input, value, '}') + 1;
+ if (*input == '"') {
+ value->tag = CUDL_TAG_STRING;
+ return parse_quoted_string(++input, &value->data.string) + 1;
+ }
cudl_err = CUDL_ERR_UNRECOGNISED_VALUE;
return 0;
}
diff --git a/cudl.h b/cudl.h
index a96cc45..df69632 100644
--- a/cudl.h
+++ b/cudl.h
@@ -31,16 +31,24 @@ enum {
CUDL_TAG_NULL,
CUDL_TAG_BOOL,
CUDL_TAG_ARRAY,
+ CUDL_TAG_MAP,
+ CUDL_TAG_STRING,
};
enum {
CUDL_OK = 0,
CUDL_ERR_OUT_OF_MEMORY,
- CUDL_ERR_EXPECTED_VALUE,
CUDL_ERR_READING,
+ CUDL_ERR_EXPECTED_VALUE,
CUDL_ERR_EXPECTED_BOOL_OR_NULL,
+ CUDL_ERR_EXPECTED_ESCAPE_SEQUENCE,
+ CUDL_ERR_EXPECTED_MAP_KEY,
+ CUDL_ERR_EXPECTED_COLON,
CUDL_ERR_UNMATCHED_BRACK,
+ CUDL_ERR_UNMATCHED_BRACE,
+ CUDL_ERR_UNMATCHED_QUOTE,
CUDL_ERR_UNRECOGNISED_VALUE,
+ CUDL_ERR_UNRECOGNISED_UNICODE,
};
extern int cudl_err;
diff --git a/spec.txt b/spec.txt
index 6070415..b144485 100644
--- a/spec.txt
+++ b/spec.txt
@@ -1,4 +1,4 @@
-# Spec attempt 5
+# CUDL - Clear and Unmistakable Data Language
* Every file contains 1 value, which may have other values nested inside it.
* A schema can be provided when a file is parsed which gives it's value a type.
@@ -17,7 +17,8 @@ An inline-end character is one of the following:
## Map
A sequence of key:value pairs. No delimeter is needed as every value will have a ending marker.
-If a key starts with a quote then it continues until another quote ends it. Quotes can be escaped by using 2 of them.
+The first key may be preceded by whitespace, and whitespace can occur before or after the : between the key and value.
+If a key starts with a quote then it obeys the same rules as a quoted string.
Otherwise a key must match [A-Za-z0-9_-]+
A map can be preceeded by a { and succeeded by a }
@@ -45,6 +46,7 @@ The following escape sequences are available for quoted strings and keys:
\" - quote
\\ - backslash
\uXXXX - unicode XXXX
+\UXXXXXXXX - unicode XXXXXXXX
```
### Multiline string
diff --git a/test.cudl b/test.cudl
index 150217a..a9de397 100644
--- a/test.cudl
+++ b/test.cudl
@@ -2,4 +2,11 @@
%true
%false
[%null %null %false]
+ ["hello\nfriend\t\tstuff\"" "world"]
+ "\U0001f600"
+ {
+ testing: "this is a test map"
+ i_hope_it_works: {nesting: "nested maps!!!"}
+ "here's an interesting key :D": %null
+ }
]