Initial commit for recording parse state in ast

This commit is contained in:
Daniel Holden
2014-04-15 16:04:07 +01:00
parent efabc31c06
commit 0164931350
5 changed files with 118 additions and 45 deletions

View File

@@ -1,6 +1,8 @@
Micro Parser Combinators
========================
Version 0.8
_mpc_ is a lightweight and powerful Parser Combinator library for C.
Using _mpc_ might be of interest to you if you are...
@@ -73,24 +75,24 @@ mpc_cleanup(4, Expr, Prod, Value, Maths);
If you were to set `input` to the string `(4 * 2 * 11 + 2) - 5`, the printed output would look like this.
```
>:
regex:
expression|>:
value|>:
char: '('
expression|>:
product|>:
value|regex: '4'
char: '*'
value|regex: '2'
char: '*'
value|regex: '11'
char: '+'
product|value|regex: '2'
char: ')'
char: '-'
product|value|regex: '5'
regex:
>
regex
expression|>
value|>
char:1:1 '('
expression|>
product|>
value|regex:1:2 '4'
char:1:4 '*'
value|regex:1:6 '2'
char:1:8 '*'
value|regex:1:10 '11'
char:1:13 '+'
product|value|regex:1:15 '2'
char:1:16 ')'
char:1:18 '-'
product|value|regex:1:20 '5'
regex
```
Getting Started
@@ -195,7 +197,7 @@ Consumes no input, always fails with message `m`.
mpc_parser_t *mpc_failf(const char *fmt, ...);
```
Consumes no input, always fails with formatted message given by `fmt` and following parameters.
Consumes no input, always fails with string formatted message given by `fmt` and following parameters.
* * *
@@ -213,6 +215,14 @@ mpc_parser_t *mpc_lift_val(mpc_val_t *x);
Consumes no input, always successful, returns `x`
* * *
```c
mpc_parser_t *mpc_state(void);
```
Consumes no input, always successful, returns a copy of the parser state as a `mpc_state_t *`. The caller must release this pointer with `free` once it is no longer needed.
Parsing
-------
@@ -683,13 +693,9 @@ It also allows for one more trick. As all the fold and destructor functions are
```
number "number" : /[0-9]+/ ;
expression : <product> (('+' | '-') <product>)* ;
product : <value> (('*' | '/') <value>)* ;
value : <number> | '(' <expression> ')' ;
maths : /^/ <expression> /$/ ;
```
@@ -748,7 +754,7 @@ Limitations & FAQ
### Does this support Unicode?
_mpc_ Only supports ASCII. Sorry! I welcome contributions as making the library support Unicode is non-trivial.
_mpc_ only supports ASCII. Sorry! Writing a parser library that supports Unicode is pretty difficult. I welcome contributions!
### Backtracking and Left Recursion
@@ -769,7 +775,7 @@ factor : <ident> '(' <expr>? (',' <expr>)* ')'
| <ident> ;
```
An alternative, and better option is to remove the ambiguity by factoring out the first identifier completely. This is better because it removes any need for backtracking at all!
An alternative, and better, option is to remove the ambiguity by factoring out the first identifier completely. This is better because it removes any need for backtracking at all! Now the grammar is predictive!
```
factor : <ident> ('(' <expr>? (',' <expr>)* ')')? ;
@@ -783,5 +789,9 @@ Some compilers limit the maximum length of string literals. If you have a huge l
There are a couple of ways to overcome this issue if it arises. You could instead use `mpca_lang_contents` and load the language from a file, or you could use a string literal for each line and let the preprocessor automatically concatenate them together, avoiding the limit. The final option is to upgrade your compiler. In C99 this limit has been increased to 4095.
### The string tag is annoying.
When parsing from a grammar, the abstract syntax tree is tagged with different tags for each primitive type it encounters. For example a regular expression will be automatically tagged as `regex`. Character literals as `char` and strings as `string`.
If you have a rule in your grammar called `string`, `char` or `regex`, you may encounter some confusion. This is because nodes will be tagged with (for example) `string` _either_ if they are a string primitive, _or_ if they were parsed via your `string` rule. If you are detecting node type using something like `strstr`, in this situation it might break. One solution to this is to always check that `string` is the innermost tag to test for string primitives, or to rename your rule called `string` to something that doesn't conflict.

1
examples/readme.maths Normal file
View File

@@ -0,0 +1 @@
(4 * 2 * 11 + 2) - 5

85
mpc.c
View File

@@ -509,9 +509,36 @@ static int mpc_input_failure(mpc_input_t *i, char c) {
}
/*
** Returns the next character of the input without consuming it,
** or '\0' when there is no further character to read.
**
** String inputs are indexed directly. File inputs read one character
** and seek back over it. Pipe inputs use the backtrack buffer when the
** current position lies inside it, and otherwise read one character
** and push it back with `ungetc`.
*/
static char mpc_input_peekc(mpc_input_t *i) {

  /* Kept as `int` so that EOF can be told apart from every real character */
  int c;

  switch (i->type) {

    case MPC_INPUT_STRING:
      return i->string[i->state.pos];

    case MPC_INPUT_FILE:
      if (feof(i->file)) { return '\0'; }
      c = fgetc(i->file);
      /*
      ** `feof` only becomes true once a read has failed, so the read
      ** result itself must be tested. Seeking back after a failed read
      ** would rewind over the last real character of the file.
      */
      if (c == EOF) { return '\0'; }
      fseek(i->file, -1, SEEK_CUR);
      return (char)c;

    case MPC_INPUT_PIPE:
      if (feof(i->file)) { return '\0'; }
      if (i->buffer && mpc_input_buffer_in_range(i)) {
        return mpc_input_buffer_get(i);
      }
      c = getc(i->file);
      /* Nothing was consumed on failure, so there is nothing to push back */
      if (c == EOF) { return '\0'; }
      ungetc(c, i->file);
      return (char)c;

    default:
      /* Unknown input type: report "no character" rather than garbage */
      return '\0';
  }

}
static int mpc_input_success(mpc_input_t *i, char c, char **o) {
@@ -2350,6 +2377,16 @@ void mpc_print(mpc_parser_t *p) {
** Testing
*/
/*
** These functions are slightly unwieldy, and
** the whole of the testing suite for mpc
** is pretty shaky.
**
** It could do with a lot more tests and more
** precision. Currently I am only really testing
** changes off of the examples.
**
*/
int mpc_unmatch(mpc_parser_t *p, const char *s, void *d,
int(*tester)(void*, void*),
@@ -2439,6 +2476,8 @@ mpc_ast_t *mpc_ast_new(const char *tag, const char *contents) {
a->contents = malloc(strlen(contents) + 1);
strcpy(a->contents, contents);
a->state = mpc_state_new();
a->children_num = 0;
a->children = NULL;
return a;
@@ -2513,15 +2552,20 @@ mpc_ast_t *mpc_ast_tag(mpc_ast_t *a, const char *t) {
return a;
}
/*
** Stores the parser state `s` in the node `a` and returns `a`.
**
** A NULL node is returned unchanged. AST folds can legitimately yield
** NULL (for example `mpcf_fold_ast` when given no inputs), so this must
** not dereference the node blindly.
*/
mpc_ast_t *mpc_ast_state(mpc_ast_t *a, mpc_state_t s) {
  if (a == NULL) { return a; }
  a->state = s;
  return a;
}
static void mpc_ast_print_depth(mpc_ast_t *a, int d) {
int i;
for (i = 0; i < d; i++) { printf(" "); }
if (strlen(a->contents)) {
printf("%s: '%s'\n", a->tag, a->contents);
printf("%s:%i:%i '%s'\n", a->tag, a->state.row+1, a->state.col+1, a->contents);
} else {
printf("%s:\n", a->tag);
printf("%s \n", a->tag);
}
for (i = 0; i < a->children_num; i++) {
@@ -2538,7 +2582,7 @@ mpc_val_t *mpcf_fold_ast(int n, mpc_val_t **xs) {
int i, j;
mpc_ast_t** as = (mpc_ast_t**)xs;
mpc_val_t *r;
mpc_ast_t *r;
if (n == 0) { return NULL; }
if (n == 1) { return xs[0]; }
@@ -2551,11 +2595,6 @@ mpc_val_t *mpcf_fold_ast(int n, mpc_val_t **xs) {
if (as[i] == NULL) { continue; }
/*
printf("%i\n", i);
mpc_ast_print(as[i]);
*/
if (as[i] && as[i]->children_num > 0) {
for (j = 0; j < as[i]->children_num; j++) {
@@ -2570,6 +2609,10 @@ mpc_val_t *mpcf_fold_ast(int n, mpc_val_t **xs) {
}
if (r->children_num) {
r->state = r->children[0]->state;
}
return r;
}
@@ -2579,6 +2622,18 @@ mpc_val_t *mpcf_str_ast(mpc_val_t *c) {
return a;
}
/*
** Fold function for a two element sequence: `xs[0]` is a heap allocated
** `mpc_state_t` and `xs[1]` is an AST node. The state is copied into the
** node, the state allocation is freed, and the node is returned.
*/
mpc_val_t *mpcf_state_ast(int n, mpc_val_t **xs) {
  mpc_state_t *state = (mpc_state_t*)xs[0];
  mpc_ast_t *node = (mpc_ast_t*)xs[1];
  mpc_ast_t *result;

  /* Always used with exactly two inputs, so the count is not consulted */
  (void)n;

  result = mpc_ast_state(node, *state);
  free(state);

  return result;
}
/*
** Wraps the AST parser `a` in a sequence that runs `mpc_state()` ahead
** of it and combines both results with `mpcf_state_ast`, so the node
** produced by `a` carries the parser state.
*/
mpc_parser_t *mpca_state(mpc_parser_t *a) {
  /* `free` is the destructor supplied for the state value */
  mpc_parser_t *with_state = mpc_and(2, mpcf_state_ast, mpc_state(), a, free);
  return with_state;
}
/*
** Wraps the AST parser `a` so that `mpc_ast_tag` is applied to its
** result with `t` as the extra argument.
*/
mpc_parser_t *mpca_tag(mpc_parser_t *a, const char *t) {
  mpc_apply_to_t apply_tag = (mpc_apply_to_t)mpc_ast_tag;
  void *tag_data = (void*)t;
  return mpc_apply_to(a, apply_tag, tag_data);
}
@@ -2728,7 +2783,7 @@ static mpc_val_t *mpcaf_grammar_string(mpc_val_t *x, void *s) {
char *y = mpcf_unescape(x);
mpc_parser_t *p = (st->flags & MPC_LANG_WHITESPACE_SENSITIVE) ? mpc_string(y) : mpc_tok(mpc_string(y));
free(y);
return mpca_tag(mpc_apply(p, mpcf_str_ast), "string");
return mpca_state(mpca_tag(mpc_apply(p, mpcf_str_ast), "string"));
}
static mpc_val_t *mpcaf_grammar_char(mpc_val_t *x, void *s) {
@@ -2736,7 +2791,7 @@ static mpc_val_t *mpcaf_grammar_char(mpc_val_t *x, void *s) {
char *y = mpcf_unescape(x);
mpc_parser_t *p = (st->flags & MPC_LANG_WHITESPACE_SENSITIVE) ? mpc_char(y[0]) : mpc_tok(mpc_char(y[0]));
free(y);
return mpca_tag(mpc_apply(p, mpcf_str_ast), "char");
return mpca_state(mpca_tag(mpc_apply(p, mpcf_str_ast), "char"));
}
static mpc_val_t *mpcaf_grammar_regex(mpc_val_t *x, void *s) {
@@ -2744,7 +2799,7 @@ static mpc_val_t *mpcaf_grammar_regex(mpc_val_t *x, void *s) {
char *y = mpcf_unescape_regex(x);
mpc_parser_t *p = (st->flags & MPC_LANG_WHITESPACE_SENSITIVE) ? mpc_re(y) : mpc_tok(mpc_re(y));
free(y);
return mpca_tag(mpc_apply(p, mpcf_str_ast), "regex");
return mpca_state(mpca_tag(mpc_apply(p, mpcf_str_ast), "regex"));
}
static int is_number(const char* s) {
@@ -2811,9 +2866,9 @@ static mpc_val_t *mpcaf_grammar_id(mpc_val_t *x, void *s) {
free(x);
if (p->name) {
return mpca_root(mpca_add_tag(p, p->name));
return mpca_state(mpca_root(mpca_add_tag(p, p->name)));
} else {
return mpca_root(p);
return mpca_state(mpca_root(p));
}
}

7
mpc.h
View File

@@ -243,6 +243,7 @@ mpc_parser_t *mpc_re(const char *re);
/*
** A node of the abstract syntax tree.
*/
typedef struct mpc_ast_t {
/* Tag string identifying the node, printed first by the AST printer */
char *tag;
/* Text of the node; `mpc_ast_new` stores its own heap copy */
char *contents;
/* Parser state recorded for the node; printed as row+1 and col+1, so presumably zero based */
mpc_state_t state;
/* Number of entries in `children`; zero for a freshly created node */
int children_num;
/* Array of child nodes; NULL for a freshly created node */
struct mpc_ast_t** children;
} mpc_ast_t;
@@ -253,18 +254,24 @@ mpc_ast_t *mpc_ast_add_root(mpc_ast_t *a);
mpc_ast_t *mpc_ast_add_child(mpc_ast_t *r, mpc_ast_t *a);
mpc_ast_t *mpc_ast_add_tag(mpc_ast_t *a, const char *t);
mpc_ast_t *mpc_ast_tag(mpc_ast_t *a, const char *t);
mpc_ast_t *mpc_ast_state(mpc_ast_t *a, mpc_state_t s);
void mpc_ast_delete(mpc_ast_t *a);
void mpc_ast_print(mpc_ast_t *a);
/*
** Warning: This function currently doesn't test for equality of the `state` member!
*/
int mpc_ast_eq(mpc_ast_t *a, mpc_ast_t *b);
mpc_val_t *mpcf_fold_ast(int n, mpc_val_t **as);
mpc_val_t *mpcf_str_ast(mpc_val_t *c);
mpc_val_t *mpcf_state_ast(int n, mpc_val_t **xs);
mpc_parser_t *mpca_tag(mpc_parser_t *a, const char *t);
mpc_parser_t *mpca_add_tag(mpc_parser_t *a, const char *t);
mpc_parser_t *mpca_root(mpc_parser_t *a);
mpc_parser_t *mpca_state(mpc_parser_t *a);
mpc_parser_t *mpca_total(mpc_parser_t *a);
mpc_parser_t *mpca_not(mpc_parser_t *a);