diff --git a/README.md b/README.md index 31d5422..18f421a 100644 --- a/README.md +++ b/README.md @@ -270,12 +270,12 @@ Returns a parser that applies function `f` (optionality taking extra input `x`) * * * ```c -mpc_parser_t* mpc_predict(mpc_parser_t* a); +mpc_parser_t* mpc_predictive(mpc_parser_t* a); ``` Returns a parser that runs `a` with backtracking disabled. This means if `a` consumes any input, it will not be reverted, even on failure. Turning backtracking off has good performance benefits for grammars which are `LL(1)`. These are grammars where the first character completely determines the parse result - such as the decision of parsing either a C identifier, number, or string literal. This option should not be used for non `LL(1)` grammars or it will produce incorrect results or crash the parser. -Another way to think of `mpc_predict` is that it can be applied to a parser (for a performance improvement) if either successfully parsing the first character will result in a completely successful parse, or all of the referenced sub-parsers are also `LL(1)`. +Another way to think of `mpc_predictive` is that it can be applied to a parser (for a performance improvement) if either successfully parsing the first character will result in a completely successful parse, or all of the referenced sub-parsers are also `LL(1)`. * * * diff --git a/TODO.md b/TODO.md index 568874b..0b0557a 100644 --- a/TODO.md +++ b/TODO.md @@ -4,7 +4,4 @@ - Find some good demo grammars to show -- Use while loop instead of recursion in parse function -- Support for stream based input - -- Make RE stuff predictive where possible \ No newline at end of file +- Support for stream based input \ No newline at end of file diff --git a/mpc.c b/mpc.c index 62c4f3c..9250eb7 100644 --- a/mpc.c +++ b/mpc.c @@ -1,7 +1,7 @@ #include "mpc.h" -#ifndef snprintf -int snprintf(char* str, size_t size, const char* fmt, ...) { +#ifndef _WIN32 +static int snprintf(char* str, size_t size, const char* fmt, ...) { int x; va_list va; va_start(va, fmt); @@ -9,10 +9,8 @@ int snprintf(char* str, size_t size, const char* fmt, ...) { va_end(va); return x; } -#endif -#ifndef vsnprintf -int vsnprintf(char* str, size_t size, const char* fmt, va_list args) { +static int vsnprintf(char* str, size_t size, const char* fmt, va_list args) { return vsprintf(str, fmt, args); } #endif @@ -339,77 +337,236 @@ char mpc_err_unexpected(mpc_err_t* x) { ** Input Type */ +/* +** In mpc the input type has three modes of +** operation: String, File and Pipe. +** +** String is easy. The whole contents are +** loaded into a buffer and scanned through. +** The cursor can jump around at will making +** backtracking easy. +** +** The second is a File which is also somewhat +** easy. The contents are never loaded into +** memory but backtracking can still be achieved +** by seeking in the file at different positions. +** +** The final mode is Pipe. This is the difficult +** one. As we assume pipes cannot be seeked - and +** only support a single character lookahead at +** any point where the input is marked for a +** potential backtracking we start buffering any +** input. +** +** This means that if we are requested to seek +** back we can simply start reading from the +** buffer instead of the input. +** +** Of course using `mpc_predictive` will disable +** backtracking and make LL(1) grammars easy +** to parse for all input methods. +** +*/ + +enum { + MPC_INPUT_STRING = 0, + MPC_INPUT_FILE = 1, + MPC_INPUT_PIPE = 2 +}; + typedef struct { char* filename; - char* str; + int type; + mpc_state_t state; + + char* str; + FILE* file; int backtrack; int marks_num; - mpc_state_t* marks; + mpc_state_t* marks_state; + char** marks_buff; } mpc_input_t; -static mpc_input_t* mpc_input_new(const char* filename, const char* str) { +static mpc_input_t* mpc_input_new_string(const char* filename, const char* str) { mpc_input_t* i = malloc(sizeof(mpc_input_t)); + + i->filename = malloc(strlen(filename) + 1); + strcpy(i->filename, filename); + i->type = MPC_INPUT_STRING; + + i->state = mpc_state_null(); + i->str = malloc(strlen(str) + 1); strcpy(i->str, str); + i->buffer = NULL; + i->file = NULL; + + i->backtrack = 1; + i->marks_num = 0; + i->marks_state = NULL; + i->marks_buff = NULL; + + return i; +} + +static mpc_input_t* mpc_input_new_file(const char* filename, FILE* f) { + + mpc_input_t* i = malloc(sizeof(mpc_input_t)); i->filename = malloc(strlen(filename) + 1); strcpy(i->filename, filename); - i->state.next = i->str[0]; - i->state.last = '\0'; - i->state.pos = 0; - i->state.row = 0; - i->state.col = 0; + if (fseek(f, 0, SEEK_CUR) != 0) { + i->type = MPC_INPUT_PIPE; + } else { + i->type = MPC_INPUT_FILE; + } + + i->state = mpc_state_null(); + + i->str = NULL; + i->buffer = NULL; + i->file = f; i->backtrack = 1; i->marks_num = 0; i->marks = NULL; - return i; + } static void mpc_input_delete(mpc_input_t* i) { + + int j; free(i->filename); - free(i->str); - free(i->marks); + + if (i->type == MPC_INPUT_STRING) { free(i->str); } + if (i->type == MPC_INPUT_PIPE) { free(i->buffer); } + + free(i->marks) free(i); } -static void mpc_input_backtrack_disable(mpc_input_t* i) { - i->backtrack = 0; -} - -static void mpc_input_backtrack_enable(mpc_input_t* i) { - i->backtrack = 1; -} +static void mpc_input_backtrack_disable(mpc_input_t* i) { i->backtrack = 0; } +static void mpc_input_backtrack_enable(mpc_input_t* i) { i->backtrack = 1; } static void mpc_input_mark(mpc_input_t* i) { + if (!i->backtrack) { return; } + i->marks_num++; i->marks = realloc(i->marks, sizeof(mpc_state_t) * i->marks_num); i->marks[i->marks_num-1] = i->state; + + if (i->type == MPC_INPUT_PIPE && i->marks_num == 1) { + i->buffer = calloc(1, 1); + } + } static void mpc_input_unmark(mpc_input_t* i) { + if (!i->backtrack) { return; } + i->marks_num--; i->marks = realloc(i->marks, sizeof(mpc_state_t) * i->marks_num); + + if (i->type == MPC_INPUT_PIPE && i->marks_num == 0) { + free(i->buffer); + i->buffer = NULL; + } + } static void mpc_input_rewind(mpc_input_t* i) { + if (!i->backtrack) { return; } + i->state = i->marks[i->marks_num-1]; + + if (i->type == MPC_INPUT_FILE) { + fseek(f, i->state.pos, SEEK_SET); + } + mpc_input_unmark(i); } -static int mpc_input_next(mpc_input_t* i, char** o) { +static int mpc_input_buffer_in_range(mpc_input_t* i) { + return i->state.pos < (strlen(i->buffer) + i->marks[0].state.pos); +} - i->state.last = i->str[i->state.pos]; +static char mpc_input_buffer_get(mpc_input_t* i) { + return i->buffer[i->state.pos - i->marks[0].state.pos]; +} + +static int mpc_input_eoi(mpc_input_t* i) { + if (i->type == MPC_INPUT_STRING && i->state.pos == strlen(i->str)) { return 1; } + if (i->type == MPC_INPUT_FILE && feof(i->file)) { return 1; } + if (i->type == MPC_INPUT_PIPE && feof(i->file)) { return 1; } + return 0; +} + +static int mpc_input_soi(mpc_input_t* i) { + if (i->state.pos == 0) { return 1; } + return 0; +} + +static char mpc_input_getc(mpc_input_t* i) { + + char c; + switch (i->type) { + + case MPC_INPUT_STRING: c = i->str[i->state.pos]; break; + case MPC_INPUT_FILE: c = fgetc(i->file); break; + case MPC_INPUT_PIPE: + + if (!i->buffer) { c = getc(i->file); } + + if (i->buffer && mpc_input_buffer_in_range(i)) { + c = mpc_input_buffer_get(i); + } else { + c = getc(i->file); + } + + break; + + } + + return c; + +} + +static void mpc_input_ungetc(mpc_input_t* i) { + switch (i->type) { + case MPC_INPUT_STRING: break; + case MPC_INPUT_FILE: fseek(i->file, -1, SEEK_CUR); break; + case MPC_INPUT_PIPE: ungetc(i->file, c); break; + } +} + +static int mpc_input_failure(mpc_input_t* i, char c, char** o) { + *o = NULL; + i->state.next = c; + return 0; +} + +static int mpc_input_success(mpc_input_t* i, char c, char** o) { + + if (i->type == MPC_INPUT_PIPE && + i->buffer && + !mpc_input_buffer_in_range(i)) { + + i->buffer = realloc(strlen(i->buffer) + 2); + i->buffer[strlen(i->buffer) + 1] = '\0'; + i->buffer[strlen(i->buffer) + 0] = c; + } + + i->state.last = c; i->state.pos++; i->state.col++; @@ -419,33 +576,26 @@ static int mpc_input_next(mpc_input_t* i, char** o) { } (*o) = malloc(2); - (*o)[0] = i->state.last; + (*o)[0] = c; (*o)[1] = '\0'; return 1; - + } static int mpc_input_any(mpc_input_t* i, char** o) { - - if (i->state.pos >= strlen(i->str)) { i->state.next = '\0'; return 0; } - if (i->str[i->state.pos] == '\0') { - i->state.next = i->str[i->state.pos]; - return 0; - } - - return mpc_input_next(i, o); - + if (mpc_input_eoi(i)) { return mpc_input_failure(i, '\0', o); } + return mpc_input_update(i, mpc_input_getc(i), o); } static int mpc_input_char(mpc_input_t* i, char c, char** o) { - if (i->state.pos >= strlen(i->str)) { i->state.next = '\0'; return 0; } - if (i->str[i->state.pos] != c) { - i->state.next = i->str[i->state.pos]; - return 0; - } + char x; + if (mpc_input_eoi(i)) { return mpc_input_failure(i, '\0', o); } - return mpc_input_next(i, o); + x = mpc_input_getc(i); + if (x != c) { mpc_input_ungetc(i); return mpc_input_failure(i, x, o); } + + return mpc_input_success(i, x, o); } @@ -659,8 +809,8 @@ static void mpc_stack_delete(mpc_stack_t* s) { static int mpc_stack_terminate(mpc_stack_t* s, mpc_result_t* r) { int ret; - if (s->parsers_num != 0) { fprintf(stderr, "Still Parsers on stack!\n"); abort(); } - if (s->results_num != 1) { fprintf(stderr, "Still Results on stack!\n"); abort(); } + if (s->parsers_num != 0) { fprintf(stderr, "Fatal Error: Still Parsers on stack!\n"); abort(); } + if (s->results_num != 1) { fprintf(stderr, "Fatal Error: Still Results on stack!\n"); abort(); } *r = s->results[0]; ret = s->returns[0]; mpc_stack_delete(s); @@ -844,9 +994,10 @@ static mpc_err_t* mpc_stack_merger_err(mpc_stack_t* s, int n) { ** When this function was written in recursive form ** it looked pretty nice. But I've since switched ** it around to an akward while loop. It was an -** unfortunate change but in the name of performance -** (and not smashing the stack). +** unfortunate change but if was a noble attempt +** in the name of performance (and not smashing the stack). ** +** But it is now a pretty ugly beast... */ #define MPC_RETURN(st, x) mpc_stack_set_state(stk, st); mpc_stack_pushp(stk, x); continue @@ -875,10 +1026,9 @@ int mpc_parse_input(mpc_input_t* i, mpc_parser_t* init, mpc_result_t* final) { switch (p->type) { - case MPC_TYPE_UNDEFINED: MPC_FAILURE(mpc_err_new_fail(i->filename, i->state, "Parser Undefined!")); - /* Trivial Parsers */ - + + case MPC_TYPE_UNDEFINED: MPC_FAILURE(mpc_err_new_fail(i->filename, i->state, "Parser Undefined!")); case MPC_TYPE_PASS: MPC_SUCCESS(NULL); case MPC_TYPE_FAIL: MPC_FAILURE(mpc_err_new_fail(i->filename, i->state, p->data.fail.m)); case MPC_TYPE_LIFT: MPC_SUCCESS(p->data.lift.lf()); @@ -888,7 +1038,6 @@ int mpc_parse_input(mpc_input_t* i, mpc_parser_t* init, mpc_result_t* final) { case MPC_TYPE_SOI: MPC_FUNCTION(NULL, mpc_input_soi(i)); case MPC_TYPE_EOI: MPC_FUNCTION(NULL, mpc_input_eoi(i)); - case MPC_TYPE_ANY: MPC_FUNCTION(s, mpc_input_any(i, &s)); case MPC_TYPE_SINGLE: MPC_FUNCTION(s, mpc_input_char(i, p->data.single.x, &s)); case MPC_TYPE_RANGE: MPC_FUNCTION(s, mpc_input_range(i, p->data.range.x, p->data.range.y, &s)); @@ -1510,7 +1659,7 @@ mpc_parser_t* mpc_apply_to(mpc_parser_t* a, mpc_apply_to_t f, void* x) { return p; } -mpc_parser_t* mpc_predict(mpc_parser_t* a) { +mpc_parser_t* mpc_predictive(mpc_parser_t* a) { mpc_parser_t* p = mpc_undefined(); p->type = MPC_TYPE_PREDICT; p->data.predict.x = a; @@ -1800,7 +1949,7 @@ mpc_parser_t* mpc_tok_squares(mpc_parser_t* a, mpc_dtor_t ad) { return mpc_tok_ ** ** ### Regular Expression Grammar ** -** : ( "|" ) | +** : | ( "|" ) ** ** : * ** @@ -1988,7 +2137,7 @@ mpc_parser_t* mpc_re(const char* re) { mpc_re_range )); - RegexEnclose = mpc_enclose(mpc_predict(Regex), (mpc_dtor_t)mpc_delete); + RegexEnclose = mpc_enclose(mpc_predictive(Regex), (mpc_dtor_t)mpc_delete); if(!mpc_parse("", re, RegexEnclose, &r)) { err_msg = mpc_err_string_new(r.error); @@ -2839,7 +2988,7 @@ mpc_parser_t* mpca_grammar_st(const char* grammar, mpca_grammar_st_t* st) { Base = mpc_new("base"); mpc_define(GrammarTotal, - mpc_apply(mpc_predict(mpc_total(Grammar, mpc_soft_delete)), mpcf_make_root) + mpc_apply(mpc_predictive(mpc_total(Grammar, mpc_soft_delete)), mpcf_make_root) ); mpc_define(Grammar, mpc_also( @@ -2990,7 +3139,7 @@ static mpc_err_t* mpca_lang_st(const char* language, mpca_grammar_st_t* st) { Base = mpc_new("base"); mpc_define(Lang, mpc_apply_to( - mpc_total(mpc_predict(mpc_many(Stmt, mpca_stmt_fold)), mpca_stmt_list_delete), + mpc_total(mpc_predictive(mpc_many(Stmt, mpca_stmt_fold)), mpca_stmt_list_delete), mpca_stmt_list_apply_to, st )); diff --git a/mpc.h b/mpc.h index f413442..9cf879d 100644 --- a/mpc.h +++ b/mpc.h @@ -102,7 +102,7 @@ mpc_parser_t* mpc_string(const char* s); mpc_parser_t* mpc_expect(mpc_parser_t* a, const char* expected); mpc_parser_t* mpc_apply(mpc_parser_t* a, mpc_apply_t f); mpc_parser_t* mpc_apply_to(mpc_parser_t* a, mpc_apply_to_t f, void* x); -mpc_parser_t* mpc_predict(mpc_parser_t* a); +mpc_parser_t* mpc_predictive(mpc_parser_t* a); mpc_parser_t* mpc_not(mpc_parser_t* a, mpc_dtor_t da); mpc_parser_t* mpc_not_else(mpc_parser_t* a, mpc_dtor_t da, mpc_lift_t lf); mpc_parser_t* mpc_maybe(mpc_parser_t* a);