diff --git a/README.md b/README.md
index 55dcdfc..7075c78 100644
--- a/README.md
+++ b/README.md
@@ -560,6 +560,20 @@ This function makes a copy of a parser `a`. This can be useful when you want to
use a parser as input for some other parsers multiple times without retaining
it.
+* * *
+
+```c
+mpc_parser_t *mpc_re(const char *re);
+mpc_parser_t *mpc_re_mode(const char *re, int mode);
+```
+
+These functions take the regular expression `re` as input and build a parser
+for it. With `mpc_re_mode`, optional mode flags can also be given. The
+available flags are `MPC_RE_MULTILINE` / `MPC_RE_M`, where the start of input
+character `^` also matches the beginning of each new line and the end of input
+character `$` also matches at newlines, and `MPC_RE_DOTALL` / `MPC_RE_S`, where
+the any-character token `.` also matches newlines (by default it doesn't).
+
Library Reference
=================
@@ -573,6 +587,7 @@ Common Parsers
mpc_soi | Matches only the start of input, returns NULL |
mpc_eoi | Matches only the end of input, returns NULL |
mpc_boundary | Matches only the boundary between words, returns NULL |
+ mpc_boundary_newline | Matches the start of a new line, returns NULL |
mpc_whitespace | Matches any whitespace character " \f\n\r\t\v" |
mpc_whitespaces | Matches zero or more whitespace characters |
mpc_blank | Matches whitespaces and frees the result, returns NULL |
@@ -807,65 +822,64 @@ mpc_err_t *mpca_lang_contents(int flags, const char *filename, ...);
This opens and reads in the contents of the file given by `filename` and passes it to `mpca_lang`.
-Case Study - Line Reader
-========================
+Case Study - Tokenizer
+======================
-Another common task we might be interested in doing is parsing a file line by line and doing something on each line we encounter. For this we can setup something like the following:
+Another common task we might be interested in doing is tokenizing some block of
+text (splitting the text into individual elements) and performing some function
+on each one of these elements as it is read. We can do this with `mpc` too.
-First, we can build a regular expression which parses a single line: `mpc_re("[^\\n]*(\\n|$)")`, next we can add a callback function using `mpc_apply` which gets called every time a line is parsed successfully `mpc_apply(mpc_re("[^\\n]*(\\n|$)"), read_line)`. Finally we can surround all of this in `mpc_many` to parse zero or more lines. The final thing might look something like this:
+First, we can build a regular expression which parses an individual token. For
+example if our tokens are identifiers, integers, commas, periods and colons we
+could build something like this `mpc_re("\\s*([a-zA-Z_]+|[0-9]+|,|\\.|:)")`.
+Next we can strip any whitespace, and add a callback function using `mpc_apply`
+which gets called every time this regex is parsed successfully
+`mpc_apply(mpc_strip(mpc_re("\\s*([a-zA-Z_]+|[0-9]+|,|\\.|:)")), print_token)`.
+Finally we can surround all of this in `mpc_many` to parse it zero or more
+times. The final code might look something like this:
```c
-static void* read_line(void* line) {
- printf("Reading Line: %s", (char*)line);
- return line;
+static mpc_val_t *print_token(mpc_val_t *x) {
+ printf("Token: '%s'\n", (char*)x);
+ return x;
}
int main(int argc, char **argv) {
- const char *input =
- "abcHVwufvyuevuy3y436782\n"
- "\n"
- "\n"
- "rehre\n"
- "rew\n"
- "-ql.;qa\n"
- "eg";
-
- mpc_parser_t* Line = mpc_many(
- mpcf_strfold,
- mpc_apply(mpc_re("[^\\n]*(\\n|$)"), read_line));
+ const char *input = " hello 4352 , \n foo.bar \n\n test:ing ";
+
+ mpc_parser_t* Tokens = mpc_many(
+ mpcf_all_free,
+ mpc_apply(mpc_strip(mpc_re("\\s*([a-zA-Z_]+|[0-9]+|,|\\.|:)")), print_token));
mpc_result_t r;
-
- mpc_parse("input", input, Line, &r);
- printf("\nParsed String: %s", (char*)r.output);
- free(r.output);
+ mpc_parse("input", input, Tokens, &r);
- mpc_delete(Line);
+ mpc_delete(Tokens);
return 0;
}
```
-This program will produce an output something like this:
+Running this program will produce an output something like this:
```
-Reading Line: abcHVwufvyuevuy3y436782
-Reading Line:
-Reading Line:
-Reading Line: rehre
-Reading Line: rew
-Reading Line: -ql.;qa
-Reading Line: eg
-Parsed String: abcHVwufvyuevuy3y436782
-
-
-rehre
-rew
--ql.;qa
-eg
+Token: 'hello'
+Token: '4352'
+Token: ','
+Token: 'foo'
+Token: '.'
+Token: 'bar'
+Token: 'test'
+Token: ':'
+Token: 'ing'
```
+By adding more alternatives to the regex we can handle many more types of
+tokens, and quickly build a tokenizer for whatever language we are
+interested in.
+
+
Error Reporting
===============
diff --git a/mpc.c b/mpc.c
index 69540a6..6f9bac4 100644
--- a/mpc.c
+++ b/mpc.c
@@ -1979,7 +1979,13 @@ static int mpc_boundary_anchor(char prev, char next) {
return 0;
}
-mpc_parser_t *mpc_boundary(void) { return mpc_expect(mpc_anchor(mpc_boundary_anchor), "boundary"); }
+static int mpc_boundary_newline_anchor(char prev, char next) { /* Anchor predicate: nonzero only when the previous character is '\n'. */
+ (void)next; /* The following character is not consulted. */
+ return prev == '\n';
+}
+
+mpc_parser_t *mpc_boundary(void) { return mpc_expect(mpc_anchor(mpc_boundary_anchor), "word boundary"); } /* Matches only the boundary between words. */
+mpc_parser_t *mpc_boundary_newline(void) { return mpc_expect(mpc_anchor(mpc_boundary_newline_anchor), "start of newline"); } /* Succeeds only directly after a '\n'; multiline '^' pairs it with mpc_soi() for the first line. */
mpc_parser_t *mpc_whitespace(void) { return mpc_expect(mpc_oneof(" \f\n\r\t\v"), "whitespace"); }
mpc_parser_t *mpc_whitespaces(void) { return mpc_expect(mpc_many(mpcf_strfold, mpc_whitespace()), "spaces"); }
@@ -2192,19 +2198,44 @@ static mpc_parser_t *mpc_re_escape_char(char c) {
}
}
-static mpc_val_t *mpcf_re_escape(mpc_val_t *x) {
+static mpc_val_t *mpcf_re_escape(mpc_val_t *x, void* data) {
+ int mode = *((int*)data);
char *s = x;
mpc_parser_t *p;
- /* Regex Special Characters */
- if (s[0] == '.') { free(s); return mpc_any(); }
- if (s[0] == '^') { free(s); return mpc_and(2, mpcf_snd, mpc_soi(), mpc_lift(mpcf_ctor_str), free); }
+ /* Any Character */
+ if (s[0] == '.') {
+ free(s);
+ if (mode & MPC_RE_DOTALL) {
+ return mpc_any();
+ } else {
+ return mpc_expect(mpc_noneof("\n"), "any character except a newline");
+ }
+ }
+
+ /* Start of Input */
+ if (s[0] == '^') {
+ free(s);
+ if (mode & MPC_RE_MULTILINE) {
+ return mpc_and(2, mpcf_snd, mpc_or(2, mpc_soi(), mpc_boundary_newline()), mpc_lift(mpcf_ctor_str), free);
+ } else {
+ return mpc_and(2, mpcf_snd, mpc_soi(), mpc_lift(mpcf_ctor_str), free);
+ }
+ }
+
+ /* End of Input */
if (s[0] == '$') {
free(s);
- return mpc_or(2,
- mpc_and(2, mpcf_fst, mpc_newline(), mpc_eoi(), free),
- mpc_and(2, mpcf_snd, mpc_eoi(), mpc_lift(mpcf_ctor_str), free));
+ if (mode & MPC_RE_MULTILINE) {
+ return mpc_or(2,
+ mpc_newline(),
+ mpc_and(2, mpcf_snd, mpc_eoi(), mpc_lift(mpcf_ctor_str), free));
+ } else {
+ return mpc_or(2,
+ mpc_and(2, mpcf_fst, mpc_newline(), mpc_eoi(), free),
+ mpc_and(2, mpcf_snd, mpc_eoi(), mpc_lift(mpcf_ctor_str), free));
+ }
}
/* Regex Escape */
@@ -2302,6 +2333,10 @@ static mpc_val_t *mpcf_re_range(mpc_val_t *x) {
}
mpc_parser_t *mpc_re(const char *re) {
+ return mpc_re_mode(re, MPC_RE_DEFAULT); /* Plain mpc_re is mpc_re_mode with no mode flags set. */
+}
+
+mpc_parser_t *mpc_re_mode(const char *re, int mode) {
char *err_msg;
mpc_parser_t *err_out;
@@ -2334,8 +2369,8 @@ mpc_parser_t *mpc_re(const char *re) {
mpc_define(Base, mpc_or(4,
mpc_parens(Regex, (mpc_dtor_t)mpc_delete),
mpc_squares(Range, (mpc_dtor_t)mpc_delete),
- mpc_apply(mpc_escape(), mpcf_re_escape),
- mpc_apply(mpc_noneof(")|"), mpcf_re_escape)
+ mpc_apply_to(mpc_escape(), mpcf_re_escape, &mode),
+ mpc_apply_to(mpc_noneof(")|"), mpcf_re_escape, &mode)
));
mpc_define(Range, mpc_apply(
@@ -3320,7 +3355,7 @@ mpc_parser_t *mpca_total(mpc_parser_t *a) { return mpc_total(a, (mpc_dtor_t)mpc_
** : "<" ( | ) ">"
** |
** |
-** |
+** |
** | "(" ")"
*/
@@ -3379,11 +3414,21 @@ static mpc_val_t *mpcaf_grammar_char(mpc_val_t *x, void *s) {
return mpca_state(mpca_tag(mpc_apply(p, mpcf_str_ast), "char"));
}
-static mpc_val_t *mpcaf_grammar_regex(mpc_val_t *x, void *s) {
- mpca_grammar_st_t *st = s;
- char *y = mpcf_unescape_regex(x);
- mpc_parser_t *p = (st->flags & MPCA_LANG_WHITESPACE_SENSITIVE) ? mpc_re(y) : mpc_tok(mpc_re(y));
+static mpc_val_t *mpcaf_fold_regex(int n, mpc_val_t **xs) { /* Fold for a grammar regex literal: builds the regex parser from the literal, its flag letters and the grammar state. */
+ char *y = xs[0]; /* Regex literal text. */
+ char *m = xs[1]; /* Flag letters collected from "ms"; presumably an empty string when none are given. */
+ mpca_grammar_st_t *st = xs[2]; /* Grammar state supplied via mpc_lift_val; not freed here. */
+ mpc_parser_t *p;
+ int mode = MPC_RE_DEFAULT;
+
+ (void)n; /* The element count is not needed. */
+ if (strchr(m, 'm')) { mode |= MPC_RE_MULTILINE; }
+ if (strchr(m, 's')) { mode |= MPC_RE_DOTALL; }
+ y = mpcf_unescape_regex(y);
+ p = (st->flags & MPCA_LANG_WHITESPACE_SENSITIVE) ? mpc_re_mode(y, mode) : mpc_tok(mpc_re_mode(y, mode)); /* Wrapped in mpc_tok unless the grammar is whitespace sensitive. */
free(y);
+ free(m); /* The flag string is no longer needed once the mode bits are computed. */
+
return mpca_state(mpca_tag(mpc_apply(p, mpcf_str_ast), "regex"));
}
@@ -3496,7 +3541,7 @@ mpc_parser_t *mpca_grammar_st(const char *grammar, mpca_grammar_st_t *st) {
mpc_define(Base, mpc_or(5,
mpc_apply_to(mpc_tok(mpc_string_lit()), mpcaf_grammar_string, st),
mpc_apply_to(mpc_tok(mpc_char_lit()), mpcaf_grammar_char, st),
- mpc_apply_to(mpc_tok(mpc_regex_lit()), mpcaf_grammar_regex, st),
+ mpc_tok(mpc_and(3, mpcaf_fold_regex, mpc_regex_lit(), mpc_many(mpcf_strfold, mpc_oneof("ms")), mpc_lift_val(st), free, free)),
mpc_apply_to(mpc_tok_braces(mpc_or(2, mpc_digits(), mpc_ident()), free), mpcaf_grammar_id, st),
mpc_tok_parens(Grammar, mpc_soft_delete)
));
@@ -3658,7 +3703,7 @@ static mpc_err_t *mpca_lang_st(mpc_input_t *i, mpca_grammar_st_t *st) {
mpc_define(Base, mpc_or(5,
mpc_apply_to(mpc_tok(mpc_string_lit()), mpcaf_grammar_string, st),
mpc_apply_to(mpc_tok(mpc_char_lit()), mpcaf_grammar_char, st),
- mpc_apply_to(mpc_tok(mpc_regex_lit()), mpcaf_grammar_regex, st),
+ mpc_tok(mpc_and(3, mpcaf_fold_regex, mpc_regex_lit(), mpc_many(mpcf_strfold, mpc_oneof("ms")), mpc_lift_val(st), free, free)),
mpc_apply_to(mpc_tok_braces(mpc_or(2, mpc_digits(), mpc_ident()), free), mpcaf_grammar_id, st),
mpc_tok_parens(Grammar, mpc_soft_delete)
));
diff --git a/mpc.h b/mpc.h
index 1e85f90..7ea12e3 100644
--- a/mpc.h
+++ b/mpc.h
@@ -156,6 +156,7 @@ mpc_parser_t *mpc_eoi(void);
mpc_parser_t *mpc_soi(void);
mpc_parser_t *mpc_boundary(void);
+mpc_parser_t *mpc_boundary_newline(void);
mpc_parser_t *mpc_whitespace(void);
mpc_parser_t *mpc_whitespaces(void);
@@ -264,7 +265,16 @@ mpc_val_t *mpcf_maths(int n, mpc_val_t** xs);
** Regular Expression Parsers
*/
+enum { /* Mode flags for mpc_re_mode; tested with bitwise AND by the regex builder. */
+ MPC_RE_DEFAULT = 0, /* No flags set. */
+ MPC_RE_M = 1, /* Short alias of MPC_RE_MULTILINE. */
+ MPC_RE_S = 2, /* Short alias of MPC_RE_DOTALL. */
+ MPC_RE_MULTILINE = 1, /* '^' also matches after a newline and '$' also matches a newline. */
+ MPC_RE_DOTALL = 2 /* '.' also matches '\n'. */
+};
+
mpc_parser_t *mpc_re(const char *re);
+mpc_parser_t *mpc_re_mode(const char *re, int mode);
/*
** AST
diff --git a/package.json b/package.json
index adcb3bf..27738f7 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
{
"name": "mpc",
- "version": "0.8.8",
+ "version": "0.9.8",
"repo": "orangeduck/mpc",
"description": "A Parser Combinator library for C",
"keywords": ["parser", "combinator", "library", "c", "mpc"],
diff --git a/tests/core.c b/tests/core.c
index bd450cf..80329c6 100644
--- a/tests/core.c
+++ b/tests/core.c
@@ -154,7 +154,7 @@ void test_copy(void) {
static int line_count = 0;
-static void* read_line(void* line) {
+static mpc_val_t* read_line(mpc_val_t* line) { /* Apply callback: counts each parsed line and passes it through unchanged. */
line_count++;
return line;
}
@@ -185,6 +185,32 @@ void test_reader(void) {
}
+static int token_count = 0;
+
+static mpc_val_t *print_token(mpc_val_t *x) { /* Apply callback: prints the token, counts it, and passes it through unchanged. */
+ printf("Token: '%s'\n", (char*)x);
+ token_count++;
+ return x;
+}
+
+void test_tokens(void) { /* Tokenizer case study: split the input into tokens and check both the joined text and the token count. */
+
+ mpc_parser_t* Tokens = mpc_many(
+ mpcf_strfold, /* The tokens are folded into one string, which the expected value below is compared against. */
+ mpc_apply(mpc_strip(mpc_re("\\s*([a-zA-Z_]+|[0-9]+|,|\\.|:)")), print_token));
+
+ token_count = 0; /* Reset the counter incremented by print_token. */
+
+ PT_ASSERT(mpc_test_pass(Tokens,
+ " hello 4352 , \n foo.bar \n\n test:ing ",
+ "hello4352,foo.bartest:ing", streq, free, strprint));
+
+ PT_ASSERT(token_count == 9); /* hello 4352 , foo . bar test : ing */
+
+ mpc_delete(Tokens);
+
+}
+
void test_eoi(void) {
mpc_parser_t* Line = mpc_re("[^\\n]*$");
@@ -203,5 +229,6 @@ void suite_core(void) {
pt_add_test(test_repeat, "Test Repeat", "Suite Core");
pt_add_test(test_copy, "Test Copy", "Suite Core");
pt_add_test(test_reader, "Test Reader", "Suite Core");
+ pt_add_test(test_tokens, "Test Tokens", "Suite Core");
pt_add_test(test_eoi, "Test EOI", "Suite Core");
}
diff --git a/tests/grammar.c b/tests/grammar.c
index f45fdcf..3e5134c 100644
--- a/tests/grammar.c
+++ b/tests/grammar.c
@@ -162,7 +162,9 @@ void test_partial(void) {
(int(*)(const void*,const void*))mpc_ast_eq,
(mpc_dtor_t)mpc_ast_delete,
(void(*)(const void*))mpc_ast_print));
-
+
+ mpc_ast_delete(t0);
+
mpc_cleanup(5, Line, Number, QuotedString, LinePragma, Parser);
}
@@ -248,6 +250,8 @@ void test_qscript(void) {
(mpc_dtor_t)mpc_ast_delete,
(void(*)(const void*))mpc_ast_print));
+ mpc_ast_delete(t0);
+
mpc_cleanup(18, Qscript, Comment, Resource, Rtype, Rname, InnerBlock,
Statement, Function, Parameter, Literal, Block, Seperator, Qstring,
SimpleStr, ComplexStr, Number, Float, Int);
@@ -278,6 +282,61 @@ void test_missingrule(void) {
}
+void test_regex_mode(void) { /* Checks the 's' and 'm' suffixes on grammar regex literals against the same patterns without a suffix. */
+
+ mpc_parser_t *Line0, *Line1, *Line2, *Line3;
+ mpc_ast_t *t0, *t1, *t2, *t3, *t4;
+
+ Line0 = mpc_new("line0");
+ Line1 = mpc_new("line1");
+ Line2 = mpc_new("line2");
+ Line3 = mpc_new("line3");
+
+ mpca_lang(MPCA_LANG_DEFAULT, " line0 : /.*/; ", Line0); /* No suffix: default mode. */
+ mpca_lang(MPCA_LANG_DEFAULT, " line1 : /.*/s; ", Line1); /* 's' suffix: MPC_RE_DOTALL. */
+ mpca_lang(MPCA_LANG_DEFAULT, " line2 : /(^[a-z]*$)*/; ", Line2); /* No suffix: default mode. */
+ mpca_lang(MPCA_LANG_DEFAULT, " line3 : /(^[a-z]*$)*/m; ", Line3); /* 'm' suffix: MPC_RE_MULTILINE. */
+
+ t0 = mpc_ast_new("regex", "blah"); /* Default '.' stops at the newline. */
+ t1 = mpc_ast_new("regex", "blah\nblah"); /* DOTALL '.' runs across the newline. */
+ t2 = mpc_ast_new("regex", ""); /* Default '^...$' yields an empty match on two-line input. */
+ t3 = mpc_ast_new("regex", "blah");
+ t4 = mpc_ast_new("regex", "blah\nblah"); /* MULTILINE '^...$' matches both lines. */
+
+ PT_ASSERT(mpc_test_pass(Line0, "blah\nblah", t0,
+ (int(*)(const void*,const void*))mpc_ast_eq,
+ (mpc_dtor_t)mpc_ast_delete,
+ (void(*)(const void*))mpc_ast_print));
+
+ PT_ASSERT(mpc_test_pass(Line1, "blah\nblah", t1,
+ (int(*)(const void*,const void*))mpc_ast_eq,
+ (mpc_dtor_t)mpc_ast_delete,
+ (void(*)(const void*))mpc_ast_print));
+
+ PT_ASSERT(mpc_test_pass(Line2, "blah\nblah", t2,
+ (int(*)(const void*,const void*))mpc_ast_eq,
+ (mpc_dtor_t)mpc_ast_delete,
+ (void(*)(const void*))mpc_ast_print));
+
+ PT_ASSERT(mpc_test_pass(Line2, "blah", t3,
+ (int(*)(const void*,const void*))mpc_ast_eq,
+ (mpc_dtor_t)mpc_ast_delete,
+ (void(*)(const void*))mpc_ast_print));
+
+ PT_ASSERT(mpc_test_pass(Line3, "blah\nblah", t4,
+ (int(*)(const void*,const void*))mpc_ast_eq,
+ (mpc_dtor_t)mpc_ast_delete,
+ (void(*)(const void*))mpc_ast_print));
+
+ mpc_ast_delete(t0); /* The expected ASTs are released by the test itself. */
+ mpc_ast_delete(t1);
+ mpc_ast_delete(t2);
+ mpc_ast_delete(t3);
+ mpc_ast_delete(t4);
+
+ mpc_cleanup(4, Line0, Line1, Line2, Line3);
+}
+
void suite_grammar(void) {
pt_add_test(test_grammar, "Test Grammar", "Suite Grammar");
pt_add_test(test_language, "Test Language", "Suite Grammar");
@@ -286,4 +345,5 @@ void suite_grammar(void) {
pt_add_test(test_partial, "Test Partial", "Suite Grammar");
pt_add_test(test_qscript, "Test QScript", "Suite Grammar");
pt_add_test(test_missingrule, "Test Missing Rule", "Suite Grammar");
+ pt_add_test(test_regex_mode, "Test Regex Mode", "Suite Grammar");
}
diff --git a/tests/regex.c b/tests/regex.c
index 40f7ea1..f72dc5a 100644
--- a/tests/regex.c
+++ b/tests/regex.c
@@ -132,6 +132,43 @@ void test_regex_newline(void) {
}
+void test_regex_multiline(void) { /* Exercises MPC_RE_MULTILINE: '^' and '$' apply at every line of lowercase-only input. */
+
+ mpc_parser_t *re0 = mpc_re_mode("(^[a-z]*$)*", MPC_RE_MULTILINE);
+
+ PT_ASSERT(regex_test_pass(re0, "hello\nhello", "hello\nhello"));
+ PT_ASSERT(regex_test_pass(re0, "hello\nhello\n", "hello\nhello\n"));
+ PT_ASSERT(regex_test_pass(re0, "\nblah\n\nblah\n", "\nblah\n\nblah\n")); /* Empty lines are accepted too. */
+ PT_ASSERT(regex_test_fail(re0, "45234", "45234")); /* Digits fall outside [a-z]. */
+ PT_ASSERT(regex_test_fail(re0, "\n45234", "\n45234"));
+ PT_ASSERT(regex_test_pass(re0, "\n45234", "\n")); /* Only the leading empty line is matched. */
+
+ mpc_delete(re0);
+
+}
+
+void test_regex_dotall(void) { /* Compares "^.*$" in default mode against MPC_RE_DOTALL on the same inputs. */
+
+ mpc_parser_t *re0 = mpc_re_mode("^.*$", MPC_RE_DEFAULT); /* '.' does not match '\n'. */
+ mpc_parser_t *re1 = mpc_re_mode("^.*$", MPC_RE_DOTALL); /* '.' also matches '\n'. */
+
+ PT_ASSERT(regex_test_pass(re0, "hello", "hello"));
+ PT_ASSERT(regex_test_fail(re0, "hello\n", "hello"));
+ PT_ASSERT(regex_test_fail(re0, "he\nllo\n", "he"));
+ PT_ASSERT(regex_test_pass(re0, "34njaksdklmasd", "34njaksdklmasd"));
+ PT_ASSERT(regex_test_fail(re0, "34njaksd\nklmasd", "34njaksd"));
+
+ PT_ASSERT(regex_test_pass(re1, "hello", "hello"));
+ PT_ASSERT(regex_test_pass(re1, "hello\n", "hello\n"));
+ PT_ASSERT(regex_test_pass(re1, "he\nllo\n", "he\nllo\n")); /* Embedded newlines are consumed by '.'. */
+ PT_ASSERT(regex_test_pass(re1, "34njaksdklmasd", "34njaksdklmasd"));
+ PT_ASSERT(regex_test_pass(re1, "34njaksd\nklmasd", "34njaksd\nklmasd"));
+
+ mpc_delete(re0);
+ mpc_delete(re1);
+
+}
+
void suite_regex(void) {
pt_add_test(test_regex_basic, "Test Regex Basic", "Suite Regex");
pt_add_test(test_regex_range, "Test Regex Range", "Suite Regex");
@@ -139,4 +176,6 @@ void suite_regex(void) {
pt_add_test(test_regex_lisp_comment, "Test Regex Lisp Comment", "Suite Regex");
pt_add_test(test_regex_boundary, "Test Regex Boundary", "Suite Regex");
pt_add_test(test_regex_newline, "Test Regex Newline", "Suite Regex");
+ pt_add_test(test_regex_multiline, "Test Regex Multiline", "Suite Regex");
+ pt_add_test(test_regex_dotall, "Test Regex Dotall", "Suite Regex");
}