From 01649313504ff0b153893c4b8bde29f550edcbe1 Mon Sep 17 00:00:00 2001 From: Daniel Holden Date: Tue, 15 Apr 2014 16:04:07 +0100 Subject: [PATCH] Initial commit for recording parse state in ast --- README.md | 68 +++++++++++++++++++--------------- examples/readme.maths | 1 + mpc.c | 85 +++++++++++++++++++++++++++++++++++-------- mpc.h | 7 ++++ tests/grammar.c | 2 +- 5 files changed, 118 insertions(+), 45 deletions(-) create mode 100644 examples/readme.maths diff --git a/README.md b/README.md index 55fd737..ffe9c0f 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ Micro Parser Combinators ======================== +Version 0.8 + _mpc_ is a lightweight and powerful Parser Combinator library for C. Using _mpc_ might be of interest to you if you are... @@ -73,24 +75,24 @@ mpc_cleanup(4, Expr, Prod, Value, Maths); If you were to set `input` to the string `(4 * 2 * 11 + 2) - 5`, the printed output would look like this. ``` ->: - regex: - expression|>: - value|>: - char: '(' - expression|>: - product|>: - value|regex: '4' - char: '*' - value|regex: '2' - char: '*' - value|regex: '11' - char: '+' - product|value|regex: '2' - char: ')' - char: '-' - product|value|regex: '5' - regex: +> + regex + expression|> + value|> + char:1:1 '(' + expression|> + product|> + value|regex:1:2 '4' + char:1:4 '*' + value|regex:1:6 '2' + char:1:8 '*' + value|regex:1:10 '11' + char:1:13 '+' + product|value|regex:1:15 '2' + char:1:16 ')' + char:1:18 '-' + product|value|regex:1:20 '5' + regex ``` Getting Started @@ -195,7 +197,7 @@ Consumes no input, always fails with message `m`. mpc_parser_t *mpc_failf(const char *fmt, ...); ``` -Consumes no input, always fails with formatted message given by `fmt` and following parameters. +Consumes no input, always fails with string formatted message given by `fmt` and following parameters. 
* * * @@ -213,6 +215,14 @@ mpc_parser_t *mpc_lift_val(mpc_val_t *x); Consumes no input, always successful, returns `x` +* * * + +```c +mpc_parser_t *mpc_state(void); +``` + +Consumes no input, always successful, returns a copy of the parser state as `mpc_state_t *`. This pointer must be freed with `free` once it is no longer needed. + Parsing ------- @@ -683,14 +693,10 @@ It also allows for one more trick. As all the fold and destructor functions are ``` number "number" : /[0-9]+/ ; - -expression : (('+' | '-') )* ; - -product : (('*' | '/') )* ; - -value : | '(' ')' ; - -maths : /^/ /$/ ; +expression : (('+' | '-') )* ; +product : (('*' | '/') )* ; +value : | '(' ')' ; +maths : /^/ /$/ ; ``` String literals are surrounded in double quotes `"`. Character literals in single quotes `'` and regex literals in slashes `/`. References to other parsers are surrounded in braces `<>` and referred to by name. @@ -748,7 +754,7 @@ Limitations & FAQ ### Does this support Unicode? -_mpc_ Only supports ASCII. Sorry! I welcome contributions as making the library support Unicode is non-trivial. +_mpc_ only supports ASCII. Sorry! Writing a parser library that supports Unicode is pretty difficult. I welcome contributions! ### Backtracking and Left Recursion @@ -769,7 +775,7 @@ factor : '(' ? (',' )* ')' | ; ``` -An alternative, and better option is to remove the ambiguity by factoring out the first identifier completely. This is better because it removes any need for backtracking at all! +An alternative, and better, option is to remove the ambiguity by factoring out the first identifier completely. This is better because it removes any need for backtracking at all! Now the grammar is predictive! ``` factor : ('(' ? (',' )* ')')? ; @@ -783,5 +789,9 @@ Some compilers limit the maximum length of string literals. If you have a huge l There are a couple of ways to overcome this issue if it arises. 
You could instead use `mpca_lang_contents` and load the language from file or you could use a string literal for each line and let the preprocessor automatically concatenate them together, avoiding the limit. The final option is to upgrade your compiler. In C99 this limit has been increased to 4095. +### The string tag is annoying. +When parsing from a grammar, the abstract syntax tree is tagged with different tags for each primitive type it encounters. For example a regular expression will be automatically tagged as `regex`. Character literals are tagged as `char` and strings as `string`. + +If you have a rule in your grammar called `string`, `char` or `regex`, you may encounter some confusion. This is because nodes will be tagged with (for example) `string` _either_ if they are a string primitive, _or_ if they were parsed via your `string` rule. If you detect node types using something like `strstr`, that check may break in this situation. One solution is to always check that `string` is the innermost tag when testing for string primitives; another is to rename your `string` rule to something that doesn't conflict. 
diff --git a/examples/readme.maths b/examples/readme.maths new file mode 100644 index 0000000..aea9ef3 --- /dev/null +++ b/examples/readme.maths @@ -0,0 +1 @@ +(4 * 2 * 11 + 2) - 5 \ No newline at end of file diff --git a/mpc.c b/mpc.c index 464d655..8a40104 100644 --- a/mpc.c +++ b/mpc.c @@ -509,9 +509,36 @@ static int mpc_input_failure(mpc_input_t *i, char c) { } static char mpc_input_peekc(mpc_input_t *i) { - char c = mpc_input_getc(i); - mpc_input_failure(i, c); + + char c; + + switch (i->type) { + case MPC_INPUT_STRING: return i->string[i->state.pos]; + case MPC_INPUT_FILE: + + if (feof(i->file)) { return '\0'; } + + c = fgetc(i->file); + fseek(i->file, -1, SEEK_CUR); + break; + + case MPC_INPUT_PIPE: + + if (feof(i->file)) { return '\0'; } + + if (!i->buffer) { c = getc(i->file); ungetc(c, i->file); break; } + + if (i->buffer && mpc_input_buffer_in_range(i)) { + return mpc_input_buffer_get(i); + } else { + c = getc(i->file); ungetc(c, i->file); + break; + } + + } + return c; + } static int mpc_input_success(mpc_input_t *i, char c, char **o) { @@ -2350,6 +2377,16 @@ void mpc_print(mpc_parser_t *p) { ** Testing */ +/* +** These functions are slightly unwieldy and +** also the whole of the testing suite for mpc +** mpc is pretty shaky. +** +** It could do with a lot more tests and more +** precision. Currently I am only really testing +** changes off of the examples. 
+** +*/ int mpc_unmatch(mpc_parser_t *p, const char *s, void *d, int(*tester)(void*, void*), @@ -2439,6 +2476,8 @@ mpc_ast_t *mpc_ast_new(const char *tag, const char *contents) { a->contents = malloc(strlen(contents) + 1); strcpy(a->contents, contents); + a->state = mpc_state_new(); + a->children_num = 0; a->children = NULL; return a; @@ -2513,15 +2552,20 @@ mpc_ast_t *mpc_ast_tag(mpc_ast_t *a, const char *t) { return a; } +mpc_ast_t *mpc_ast_state(mpc_ast_t *a, mpc_state_t s) { + a->state = s; + return a; +} + static void mpc_ast_print_depth(mpc_ast_t *a, int d) { int i; for (i = 0; i < d; i++) { printf(" "); } if (strlen(a->contents)) { - printf("%s: '%s'\n", a->tag, a->contents); + printf("%s:%i:%i '%s'\n", a->tag, a->state.row+1, a->state.col+1, a->contents); } else { - printf("%s:\n", a->tag); + printf("%s \n", a->tag); } for (i = 0; i < a->children_num; i++) { @@ -2538,7 +2582,7 @@ mpc_val_t *mpcf_fold_ast(int n, mpc_val_t **xs) { int i, j; mpc_ast_t** as = (mpc_ast_t**)xs; - mpc_val_t *r; + mpc_ast_t *r; if (n == 0) { return NULL; } if (n == 1) { return xs[0]; } @@ -2551,11 +2595,6 @@ mpc_val_t *mpcf_fold_ast(int n, mpc_val_t **xs) { if (as[i] == NULL) { continue; } - /* - printf("%i\n", i); - mpc_ast_print(as[i]); - */ - if (as[i] && as[i]->children_num > 0) { for (j = 0; j < as[i]->children_num; j++) { @@ -2570,6 +2609,10 @@ mpc_val_t *mpcf_fold_ast(int n, mpc_val_t **xs) { } + if (r->children_num) { + r->state = r->children[0]->state; + } + return r; } @@ -2579,6 +2622,18 @@ mpc_val_t *mpcf_str_ast(mpc_val_t *c) { return a; } +mpc_val_t *mpcf_state_ast(int n, mpc_val_t **xs) { + mpc_state_t *s = ((mpc_state_t**)xs)[0]; + mpc_ast_t *a = ((mpc_ast_t**)xs)[1]; + a = mpc_ast_state(a, *s); + free(s); + return a; +} + +mpc_parser_t *mpca_state(mpc_parser_t *a) { + return mpc_and(2, mpcf_state_ast, mpc_state(), a, free); +} + mpc_parser_t *mpca_tag(mpc_parser_t *a, const char *t) { return mpc_apply_to(a, (mpc_apply_to_t)mpc_ast_tag, (void*)t); } @@ -2728,7 
+2783,7 @@ static mpc_val_t *mpcaf_grammar_string(mpc_val_t *x, void *s) { char *y = mpcf_unescape(x); mpc_parser_t *p = (st->flags & MPC_LANG_WHITESPACE_SENSITIVE) ? mpc_string(y) : mpc_tok(mpc_string(y)); free(y); - return mpca_tag(mpc_apply(p, mpcf_str_ast), "string"); + return mpca_state(mpca_tag(mpc_apply(p, mpcf_str_ast), "string")); } static mpc_val_t *mpcaf_grammar_char(mpc_val_t *x, void *s) { @@ -2736,7 +2791,7 @@ static mpc_val_t *mpcaf_grammar_char(mpc_val_t *x, void *s) { char *y = mpcf_unescape(x); mpc_parser_t *p = (st->flags & MPC_LANG_WHITESPACE_SENSITIVE) ? mpc_char(y[0]) : mpc_tok(mpc_char(y[0])); free(y); - return mpca_tag(mpc_apply(p, mpcf_str_ast), "char"); + return mpca_state(mpca_tag(mpc_apply(p, mpcf_str_ast), "char")); } static mpc_val_t *mpcaf_grammar_regex(mpc_val_t *x, void *s) { @@ -2744,7 +2799,7 @@ static mpc_val_t *mpcaf_grammar_regex(mpc_val_t *x, void *s) { char *y = mpcf_unescape_regex(x); mpc_parser_t *p = (st->flags & MPC_LANG_WHITESPACE_SENSITIVE) ? 
mpc_re(y) : mpc_tok(mpc_re(y)); free(y); - return mpca_tag(mpc_apply(p, mpcf_str_ast), "regex"); + return mpca_state(mpca_tag(mpc_apply(p, mpcf_str_ast), "regex")); } static int is_number(const char* s) { @@ -2811,9 +2866,9 @@ static mpc_val_t *mpcaf_grammar_id(mpc_val_t *x, void *s) { free(x); if (p->name) { - return mpca_root(mpca_add_tag(p, p->name)); + return mpca_state(mpca_root(mpca_add_tag(p, p->name))); } else { - return mpca_root(p); + return mpca_state(mpca_root(p)); } } diff --git a/mpc.h b/mpc.h index 936958a..4e43ef6 100644 --- a/mpc.h +++ b/mpc.h @@ -243,6 +243,7 @@ mpc_parser_t *mpc_re(const char *re); typedef struct mpc_ast_t { char *tag; char *contents; + mpc_state_t state; int children_num; struct mpc_ast_t** children; } mpc_ast_t; @@ -253,18 +254,24 @@ mpc_ast_t *mpc_ast_add_root(mpc_ast_t *a); mpc_ast_t *mpc_ast_add_child(mpc_ast_t *r, mpc_ast_t *a); mpc_ast_t *mpc_ast_add_tag(mpc_ast_t *a, const char *t); mpc_ast_t *mpc_ast_tag(mpc_ast_t *a, const char *t); +mpc_ast_t *mpc_ast_state(mpc_ast_t *a, mpc_state_t s); void mpc_ast_delete(mpc_ast_t *a); void mpc_ast_print(mpc_ast_t *a); +/* +** Warning: This function currently doesn't test for equality of the `state` member! +*/ int mpc_ast_eq(mpc_ast_t *a, mpc_ast_t *b); mpc_val_t *mpcf_fold_ast(int n, mpc_val_t **as); mpc_val_t *mpcf_str_ast(mpc_val_t *c); +mpc_val_t *mpcf_state_ast(int n, mpc_val_t **xs); mpc_parser_t *mpca_tag(mpc_parser_t *a, const char *t); mpc_parser_t *mpca_add_tag(mpc_parser_t *a, const char *t); mpc_parser_t *mpca_root(mpc_parser_t *a); +mpc_parser_t *mpca_state(mpc_parser_t *a); mpc_parser_t *mpca_total(mpc_parser_t *a); mpc_parser_t *mpca_not(mpc_parser_t *a); diff --git a/tests/grammar.c b/tests/grammar.c index 1f75509..1a06d4f 100644 --- a/tests/grammar.c +++ b/tests/grammar.c @@ -52,7 +52,7 @@ void test_grammar(void) { mpc_ast_delete(t0); mpc_ast_delete(t1); mpc_ast_delete(t2); - + mpc_cleanup(4, Expr, Prod, Value, Maths); }