From 4a992d91abb21bd46dc462a1c7bfef7fbacd39da Mon Sep 17 00:00:00 2001 From: Daniel Holden Date: Sun, 14 Oct 2018 17:20:11 -0400 Subject: [PATCH] Added mode option to regex and also changed example from a line reader to a tokenizer. --- README.md | 92 ++++++++++++++++++++++++++++--------------------- mpc.c | 79 +++++++++++++++++++++++++++++++++--------- mpc.h | 10 ++++++ package.json | 2 +- tests/core.c | 29 +++++++++++++++- tests/grammar.c | 62 ++++++++++++++++++++++++++++++++- tests/regex.c | 39 +++++++++++++++++++++ 7 files changed, 254 insertions(+), 59 deletions(-) diff --git a/README.md b/README.md index 55dcdfc..7075c78 100644 --- a/README.md +++ b/README.md @@ -560,6 +560,20 @@ This function makes a copy of a parser `a`. This can be useful when you want to use a parser as input for some other parsers multiple times without retaining it. +* * * + +```c +mpc_parser_t *mpc_re(const char *re); +mpc_parser_t *mpc_re_mode(const char *re, int mode); +``` + +This function takes as input the regular expression `re` and builds a parser +for it. With the `mpc_re_mode` function optional mode flags can also be given. +Available flags are `MPC_RE_MULTILINE` / `MPC_RE_M` where the start of input +character `^` also matches the beginning of new lines and the end of input `$` +character also matches new lines, and `MPC_RE_DOTALL` / `MPC_RE_S` where the +any character token `.` also matches newlines (by default it doesn't). 
+ Library Reference ================= @@ -573,6 +587,7 @@ Common Parsers <tr><td><code>mpc_soi</code></td><td>Matches only the start of input, returns <code>NULL</code></td></tr> <tr><td><code>mpc_eoi</code></td><td>Matches only the end of input, returns <code>NULL</code></td></tr> <tr><td><code>mpc_boundary</code></td><td>Matches only the boundary between words, returns <code>NULL</code></td></tr> + <tr><td><code>mpc_boundary_newline</code></td><td>Matches the start of a new line, returns <code>NULL</code></td></tr> <tr><td><code>mpc_whitespace</code></td><td>Matches any whitespace character <code>" \f\n\r\t\v"</code></td></tr> <tr><td><code>mpc_whitespaces</code></td><td>Matches zero or more whitespace characters</td></tr> <tr><td><code>mpc_blank</code></td><td>Matches whitespaces and frees the result, returns <code>NULL</code></td></tr> @@ -807,65 +822,64 @@ mpc_err_t *mpca_lang_contents(int flags, const char *filename, ...); This opens and reads in the contents of the file given by `filename` and passes it to `mpca_lang`. -Case Study - Line Reader -======================== +Case Study - Tokenizer +====================== -Another common task we might be interested in doing is parsing a file line by line and doing something on each line we encounter. For this we can setup something like the following: +Another common task we might be interested in doing is tokenizing some block of +text (splitting the text into individual elements) and performing some function +on each one of these elements as it is read. We can do this with `mpc` too. -First, we can build a regular expression which parses a single line: `mpc_re("[^\\n]*(\\n|$)")`, next we can add a callback function using `mpc_apply` which gets called every time a line is parsed successfully `mpc_apply(mpc_re("[^\\n]*(\\n|$)"), read_line)`. Finally we can surround all of this in `mpc_many` to parse zero or more lines. The final thing might look something like this: +First, we can build a regular expression which parses an individual token. For +example, if our tokens are identifiers, integers, commas, periods and colons, we +could build something like this: `mpc_re("\\s*([a-zA-Z_]+|[0-9]+|,|\\.|:)")`. 
+Next we can strip any whitespace, and add a callback function using `mpc_apply` +which gets called every time this regex matches successfully: +`mpc_apply(mpc_strip(mpc_re("\\s*([a-zA-Z_]+|[0-9]+|,|\\.|:)")), print_token)`. +Finally, we can surround all of this in `mpc_many` to parse it zero or more +times. The final code might look something like this: ```c -static void* read_line(void* line) { - printf("Reading Line: %s", (char*)line); - return line; +static mpc_val_t *print_token(mpc_val_t *x) { + printf("Token: '%s'\n", (char*)x); + return x; } int main(int argc, char **argv) { - const char *input = - "abcHVwufvyuevuy3y436782\n" - "\n" - "\n" - "rehre\n" - "rew\n" - "-ql.;qa\n" - "eg"; - - mpc_parser_t* Line = mpc_many( - mpcf_strfold, - mpc_apply(mpc_re("[^\\n]*(\\n|$)"), read_line)); + const char *input = " hello 4352 , \n foo.bar \n\n test:ing "; + + mpc_parser_t* Tokens = mpc_many( + mpcf_all_free, + mpc_apply(mpc_strip(mpc_re("\\s*([a-zA-Z_]+|[0-9]+|,|\\.|:)")), print_token)); mpc_result_t r; - - mpc_parse("input", input, Line, &r); - printf("\nParsed String: %s", (char*)r.output); - free(r.output); + mpc_parse("input", input, Tokens, &r); - mpc_delete(Line); + mpc_delete(Tokens); return 0; } ``` -This program will produce an output something like this: +Running this program will produce output something like this: ``` -Reading Line: abcHVwufvyuevuy3y436782 -Reading Line: -Reading Line: -Reading Line: rehre -Reading Line: rew -Reading Line: -ql.;qa -Reading Line: eg -Parsed String: abcHVwufvyuevuy3y436782 - - -rehre -rew --ql.;qa -eg +Token: 'hello' +Token: '4352' +Token: ',' +Token: 'foo' +Token: '.' +Token: 'bar' +Token: 'test' +Token: ':' +Token: 'ing' ``` +By extending the regex we can handle many more types of tokens and quickly +build a tokenizer for whatever language we are +interested in. 
+ + Error Reporting =============== diff --git a/mpc.c b/mpc.c index 69540a6..6f9bac4 100644 --- a/mpc.c +++ b/mpc.c @@ -1979,7 +1979,13 @@ static int mpc_boundary_anchor(char prev, char next) { return 0; } -mpc_parser_t *mpc_boundary(void) { return mpc_expect(mpc_anchor(mpc_boundary_anchor), "boundary"); } +static int mpc_boundary_newline_anchor(char prev, char next) { + (void)next; + return prev == '\n'; +} + +mpc_parser_t *mpc_boundary(void) { return mpc_expect(mpc_anchor(mpc_boundary_anchor), "word boundary"); } +mpc_parser_t *mpc_boundary_newline(void) { return mpc_expect(mpc_anchor(mpc_boundary_newline_anchor), "start of newline"); } mpc_parser_t *mpc_whitespace(void) { return mpc_expect(mpc_oneof(" \f\n\r\t\v"), "whitespace"); } mpc_parser_t *mpc_whitespaces(void) { return mpc_expect(mpc_many(mpcf_strfold, mpc_whitespace()), "spaces"); } @@ -2192,19 +2198,44 @@ static mpc_parser_t *mpc_re_escape_char(char c) { } } -static mpc_val_t *mpcf_re_escape(mpc_val_t *x) { +static mpc_val_t *mpcf_re_escape(mpc_val_t *x, void* data) { + int mode = *((int*)data); char *s = x; mpc_parser_t *p; - /* Regex Special Characters */ - if (s[0] == '.') { free(s); return mpc_any(); } - if (s[0] == '^') { free(s); return mpc_and(2, mpcf_snd, mpc_soi(), mpc_lift(mpcf_ctor_str), free); } + /* Any Character */ + if (s[0] == '.') { + free(s); + if (mode & MPC_RE_DOTALL) { + return mpc_any(); + } else { + return mpc_expect(mpc_noneof("\n"), "any character except a newline"); + } + } + + /* Start of Input */ + if (s[0] == '^') { + free(s); + if (mode & MPC_RE_MULTILINE) { + return mpc_and(2, mpcf_snd, mpc_or(2, mpc_soi(), mpc_boundary_newline()), mpc_lift(mpcf_ctor_str), free); + } else { + return mpc_and(2, mpcf_snd, mpc_soi(), mpc_lift(mpcf_ctor_str), free); + } + } + + /* End of Input */ if (s[0] == '$') { free(s); - return mpc_or(2, - mpc_and(2, mpcf_fst, mpc_newline(), mpc_eoi(), free), - mpc_and(2, mpcf_snd, mpc_eoi(), mpc_lift(mpcf_ctor_str), free)); + if (mode & 
MPC_RE_MULTILINE) { + return mpc_or(2, + mpc_newline(), + mpc_and(2, mpcf_snd, mpc_eoi(), mpc_lift(mpcf_ctor_str), free)); + } else { + return mpc_or(2, + mpc_and(2, mpcf_fst, mpc_newline(), mpc_eoi(), free), + mpc_and(2, mpcf_snd, mpc_eoi(), mpc_lift(mpcf_ctor_str), free)); + } } /* Regex Escape */ @@ -2302,6 +2333,10 @@ static mpc_val_t *mpcf_re_range(mpc_val_t *x) { } mpc_parser_t *mpc_re(const char *re) { + return mpc_re_mode(re, MPC_RE_DEFAULT); +} + +mpc_parser_t *mpc_re_mode(const char *re, int mode) { char *err_msg; mpc_parser_t *err_out; @@ -2334,8 +2369,8 @@ mpc_parser_t *mpc_re(const char *re) { mpc_define(Base, mpc_or(4, mpc_parens(Regex, (mpc_dtor_t)mpc_delete), mpc_squares(Range, (mpc_dtor_t)mpc_delete), - mpc_apply(mpc_escape(), mpcf_re_escape), - mpc_apply(mpc_noneof(")|"), mpcf_re_escape) + mpc_apply_to(mpc_escape(), mpcf_re_escape, &mode), + mpc_apply_to(mpc_noneof(")|"), mpcf_re_escape, &mode) )); mpc_define(Range, mpc_apply( @@ -3320,7 +3355,7 @@ mpc_parser_t *mpca_total(mpc_parser_t *a) { return mpc_total(a, (mpc_dtor_t)mpc_ ** : "<" ( | ) ">" ** | ** | -** | +** | ** | "(" ")" */ @@ -3379,11 +3414,21 @@ static mpc_val_t *mpcaf_grammar_char(mpc_val_t *x, void *s) { return mpca_state(mpca_tag(mpc_apply(p, mpcf_str_ast), "char")); } -static mpc_val_t *mpcaf_grammar_regex(mpc_val_t *x, void *s) { - mpca_grammar_st_t *st = s; - char *y = mpcf_unescape_regex(x); - mpc_parser_t *p = (st->flags & MPCA_LANG_WHITESPACE_SENSITIVE) ? mpc_re(y) : mpc_tok(mpc_re(y)); +static mpc_val_t *mpcaf_fold_regex(int n, mpc_val_t **xs) { + char *y = xs[0]; + char *m = xs[1]; + mpca_grammar_st_t *st = xs[2]; + mpc_parser_t *p; + int mode = MPC_RE_DEFAULT; + + (void)n; + if (strchr(m, 'm')) { mode |= MPC_RE_MULTILINE; } + if (strchr(m, 's')) { mode |= MPC_RE_DOTALL; } + y = mpcf_unescape_regex(y); + p = (st->flags & MPCA_LANG_WHITESPACE_SENSITIVE) ? 
mpc_re_mode(y, mode) : mpc_tok(mpc_re_mode(y, mode)); free(y); + free(m); + return mpca_state(mpca_tag(mpc_apply(p, mpcf_str_ast), "regex")); } @@ -3496,7 +3541,7 @@ mpc_parser_t *mpca_grammar_st(const char *grammar, mpca_grammar_st_t *st) { mpc_define(Base, mpc_or(5, mpc_apply_to(mpc_tok(mpc_string_lit()), mpcaf_grammar_string, st), mpc_apply_to(mpc_tok(mpc_char_lit()), mpcaf_grammar_char, st), - mpc_apply_to(mpc_tok(mpc_regex_lit()), mpcaf_grammar_regex, st), + mpc_tok(mpc_and(3, mpcaf_fold_regex, mpc_regex_lit(), mpc_many(mpcf_strfold, mpc_oneof("ms")), mpc_lift_val(st), free, free)), mpc_apply_to(mpc_tok_braces(mpc_or(2, mpc_digits(), mpc_ident()), free), mpcaf_grammar_id, st), mpc_tok_parens(Grammar, mpc_soft_delete) )); @@ -3658,7 +3703,7 @@ static mpc_err_t *mpca_lang_st(mpc_input_t *i, mpca_grammar_st_t *st) { mpc_define(Base, mpc_or(5, mpc_apply_to(mpc_tok(mpc_string_lit()), mpcaf_grammar_string, st), mpc_apply_to(mpc_tok(mpc_char_lit()), mpcaf_grammar_char, st), - mpc_apply_to(mpc_tok(mpc_regex_lit()), mpcaf_grammar_regex, st), + mpc_tok(mpc_and(3, mpcaf_fold_regex, mpc_regex_lit(), mpc_many(mpcf_strfold, mpc_oneof("ms")), mpc_lift_val(st), free, free)), mpc_apply_to(mpc_tok_braces(mpc_or(2, mpc_digits(), mpc_ident()), free), mpcaf_grammar_id, st), mpc_tok_parens(Grammar, mpc_soft_delete) )); diff --git a/mpc.h b/mpc.h index 1e85f90..7ea12e3 100644 --- a/mpc.h +++ b/mpc.h @@ -156,6 +156,7 @@ mpc_parser_t *mpc_eoi(void); mpc_parser_t *mpc_soi(void); mpc_parser_t *mpc_boundary(void); +mpc_parser_t *mpc_boundary_newline(void); mpc_parser_t *mpc_whitespace(void); mpc_parser_t *mpc_whitespaces(void); @@ -264,7 +265,16 @@ mpc_val_t *mpcf_maths(int n, mpc_val_t** xs); ** Regular Expression Parsers */ +enum { + MPC_RE_DEFAULT = 0, + MPC_RE_M = 1, + MPC_RE_S = 2, + MPC_RE_MULTILINE = 1, + MPC_RE_DOTALL = 2 +}; + mpc_parser_t *mpc_re(const char *re); +mpc_parser_t *mpc_re_mode(const char *re, int mode); /* ** AST diff --git a/package.json b/package.json index 
adcb3bf..27738f7 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "mpc", - "version": "0.8.8", + "version": "0.9.8", "repo": "orangeduck/mpc", "description": "A Parser Combinator library for C", "keywords": ["parser", "combinator", "library", "c", "mpc"], diff --git a/tests/core.c b/tests/core.c index bd450cf..80329c6 100644 --- a/tests/core.c +++ b/tests/core.c @@ -154,7 +154,7 @@ void test_copy(void) { static int line_count = 0; -static void* read_line(void* line) { +static mpc_val_t* read_line(mpc_val_t* line) { line_count++; return line; } @@ -185,6 +185,32 @@ void test_reader(void) { } +static int token_count = 0; + +static mpc_val_t *print_token(mpc_val_t *x) { + printf("Token: '%s'\n", (char*)x); + token_count++; + return x; +} + +void test_tokens(void) { + + mpc_parser_t* Tokens = mpc_many( + mpcf_strfold, + mpc_apply(mpc_strip(mpc_re("\\s*([a-zA-Z_]+|[0-9]+|,|\\.|:)")), print_token)); + + token_count = 0; + + PT_ASSERT(mpc_test_pass(Tokens, + " hello 4352 , \n foo.bar \n\n test:ing ", + "hello4352,foo.bartest:ing", streq, free, strprint)); + + PT_ASSERT(token_count == 9); + + mpc_delete(Tokens); + +} + void test_eoi(void) { mpc_parser_t* Line = mpc_re("[^\\n]*$"); @@ -203,5 +229,6 @@ void suite_core(void) { pt_add_test(test_repeat, "Test Repeat", "Suite Core"); pt_add_test(test_copy, "Test Copy", "Suite Core"); pt_add_test(test_reader, "Test Reader", "Suite Core"); + pt_add_test(test_tokens, "Test Tokens", "Suite Core"); pt_add_test(test_eoi, "Test EOI", "Suite Core"); } diff --git a/tests/grammar.c b/tests/grammar.c index f45fdcf..3e5134c 100644 --- a/tests/grammar.c +++ b/tests/grammar.c @@ -162,7 +162,9 @@ void test_partial(void) { (int(*)(const void*,const void*))mpc_ast_eq, (mpc_dtor_t)mpc_ast_delete, (void(*)(const void*))mpc_ast_print)); - + + mpc_ast_delete(t0); + mpc_cleanup(5, Line, Number, QuotedString, LinePragma, Parser); } @@ -248,6 +250,8 @@ void test_qscript(void) { (mpc_dtor_t)mpc_ast_delete, (void(*)(const 
void*))mpc_ast_print)); + mpc_ast_delete(t0); + mpc_cleanup(18, Qscript, Comment, Resource, Rtype, Rname, InnerBlock, Statement, Function, Parameter, Literal, Block, Seperator, Qstring, SimpleStr, ComplexStr, Number, Float, Int); @@ -278,6 +282,61 @@ void test_missingrule(void) { } +void test_regex_mode(void) { + + mpc_parser_t *Line0, *Line1, *Line2, *Line3; + mpc_ast_t *t0, *t1, *t2, *t3, *t4; + + Line0 = mpc_new("line0"); + Line1 = mpc_new("line1"); + Line2 = mpc_new("line2"); + Line3 = mpc_new("line3"); + + mpca_lang(MPCA_LANG_DEFAULT, " line0 : /.*/; ", Line0); + mpca_lang(MPCA_LANG_DEFAULT, " line1 : /.*/s; ", Line1); + mpca_lang(MPCA_LANG_DEFAULT, " line2 : /(^[a-z]*$)*/; ", Line2); + mpca_lang(MPCA_LANG_DEFAULT, " line3 : /(^[a-z]*$)*/m; ", Line3); + + t0 = mpc_ast_new("regex", "blah"); + t1 = mpc_ast_new("regex", "blah\nblah"); + t2 = mpc_ast_new("regex", ""); + t3 = mpc_ast_new("regex", "blah"); + t4 = mpc_ast_new("regex", "blah\nblah"); + + PT_ASSERT(mpc_test_pass(Line0, "blah\nblah", t0, + (int(*)(const void*,const void*))mpc_ast_eq, + (mpc_dtor_t)mpc_ast_delete, + (void(*)(const void*))mpc_ast_print)); + + PT_ASSERT(mpc_test_pass(Line1, "blah\nblah", t1, + (int(*)(const void*,const void*))mpc_ast_eq, + (mpc_dtor_t)mpc_ast_delete, + (void(*)(const void*))mpc_ast_print)); + + PT_ASSERT(mpc_test_pass(Line2, "blah\nblah", t2, + (int(*)(const void*,const void*))mpc_ast_eq, + (mpc_dtor_t)mpc_ast_delete, + (void(*)(const void*))mpc_ast_print)); + + PT_ASSERT(mpc_test_pass(Line2, "blah", t3, + (int(*)(const void*,const void*))mpc_ast_eq, + (mpc_dtor_t)mpc_ast_delete, + (void(*)(const void*))mpc_ast_print)); + + PT_ASSERT(mpc_test_pass(Line3, "blah\nblah", t4, + (int(*)(const void*,const void*))mpc_ast_eq, + (mpc_dtor_t)mpc_ast_delete, + (void(*)(const void*))mpc_ast_print)); + + mpc_ast_delete(t0); + mpc_ast_delete(t1); + mpc_ast_delete(t2); + mpc_ast_delete(t3); + mpc_ast_delete(t4); + + mpc_cleanup(4, Line0, Line1, Line2, Line3); +} + void 
suite_grammar(void) { pt_add_test(test_grammar, "Test Grammar", "Suite Grammar"); pt_add_test(test_language, "Test Language", "Suite Grammar"); @@ -286,4 +345,5 @@ void suite_grammar(void) { pt_add_test(test_partial, "Test Partial", "Suite Grammar"); pt_add_test(test_qscript, "Test QScript", "Suite Grammar"); pt_add_test(test_missingrule, "Test Missing Rule", "Suite Grammar"); + pt_add_test(test_regex_mode, "Test Regex Mode", "Suite Grammar"); } diff --git a/tests/regex.c b/tests/regex.c index 40f7ea1..f72dc5a 100644 --- a/tests/regex.c +++ b/tests/regex.c @@ -132,6 +132,43 @@ void test_regex_newline(void) { } +void test_regex_multiline(void) { + + mpc_parser_t *re0 = mpc_re_mode("(^[a-z]*$)*", MPC_RE_MULTILINE); + + PT_ASSERT(regex_test_pass(re0, "hello\nhello", "hello\nhello")); + PT_ASSERT(regex_test_pass(re0, "hello\nhello\n", "hello\nhello\n")); + PT_ASSERT(regex_test_pass(re0, "\nblah\n\nblah\n", "\nblah\n\nblah\n")); + PT_ASSERT(regex_test_fail(re0, "45234", "45234")); + PT_ASSERT(regex_test_fail(re0, "\n45234", "\n45234")); + PT_ASSERT(regex_test_pass(re0, "\n45234", "\n")); + + mpc_delete(re0); + +} + +void test_regex_dotall(void) { + + mpc_parser_t *re0 = mpc_re_mode("^.*$", MPC_RE_DEFAULT); + mpc_parser_t *re1 = mpc_re_mode("^.*$", MPC_RE_DOTALL); + + PT_ASSERT(regex_test_pass(re0, "hello", "hello")); + PT_ASSERT(regex_test_fail(re0, "hello\n", "hello")); + PT_ASSERT(regex_test_fail(re0, "he\nllo\n", "he")); + PT_ASSERT(regex_test_pass(re0, "34njaksdklmasd", "34njaksdklmasd")); + PT_ASSERT(regex_test_fail(re0, "34njaksd\nklmasd", "34njaksd")); + + PT_ASSERT(regex_test_pass(re1, "hello", "hello")); + PT_ASSERT(regex_test_pass(re1, "hello\n", "hello\n")); + PT_ASSERT(regex_test_pass(re1, "he\nllo\n", "he\nllo\n")); + PT_ASSERT(regex_test_pass(re1, "34njaksdklmasd", "34njaksdklmasd")); + PT_ASSERT(regex_test_pass(re1, "34njaksd\nklmasd", "34njaksd\nklmasd")); + + mpc_delete(re0); + mpc_delete(re1); + +} + void suite_regex(void) { 
pt_add_test(test_regex_basic, "Test Regex Basic", "Suite Regex"); pt_add_test(test_regex_range, "Test Regex Range", "Suite Regex"); @@ -139,4 +176,6 @@ void suite_regex(void) { pt_add_test(test_regex_lisp_comment, "Test Regex Lisp Comment", "Suite Regex"); pt_add_test(test_regex_boundary, "Test Regex Boundary", "Suite Regex"); pt_add_test(test_regex_newline, "Test Regex Newline", "Suite Regex"); + pt_add_test(test_regex_multiline, "Test Regex Multiline", "Suite Regex"); + pt_add_test(test_regex_dotall, "Test Regex Dotall", "Suite Regex"); }