Added mode option to regex and also changed example from a line reader to a tokenizer.

This commit is contained in:
Daniel Holden
2018-10-14 17:20:11 -04:00
parent 95439eb9c8
commit 4a992d91ab
7 changed files with 254 additions and 59 deletions

View File

@@ -560,6 +560,20 @@ This function makes a copy of a parser `a`. This can be useful when you want to
use a parser as input for some other parsers multiple times without retaining
it.
* * *
```c
mpc_parser_t *mpc_re(const char *re);
mpc_parser_t *mpc_re_mode(const char *re, int mode);
```
These functions take as input the regular expression `re` and build a parser
for it. With the `mpc_re_mode` function, optional mode flags can also be given.
The available flags are `MPC_RE_MULTILINE` / `MPC_RE_M`, where the start of
input character `^` also matches the beginning of new lines and the end of
input character `$` also matches new lines, and `MPC_RE_DOTALL` / `MPC_RE_S`,
where the any character token `.` also matches newlines (by default it
doesn't).
Library Reference
=================
@@ -573,6 +587,7 @@ Common Parsers
<tr><td><code>mpc_soi</code></td><td>Matches only the start of input, returns <code>NULL</code></td></tr>
<tr><td><code>mpc_eoi</code></td><td>Matches only the end of input, returns <code>NULL</code></td></tr>
<tr><td><code>mpc_boundary</code></td><td>Matches only the boundary between words, returns <code>NULL</code></td></tr>
<tr><td><code>mpc_boundary_newline</code></td><td>Matches the start of a new line, returns <code>NULL</code></td></tr>
<tr><td><code>mpc_whitespace</code></td><td>Matches any whitespace character <code>" \f\n\r\t\v"</code></td></tr>
<tr><td><code>mpc_whitespaces</code></td><td>Matches zero or more whitespace characters</td></tr>
<tr><td><code>mpc_blank</code></td><td>Matches whitespaces and frees the result, returns <code>NULL</code></td></tr>
@@ -807,65 +822,64 @@ mpc_err_t *mpca_lang_contents(int flags, const char *filename, ...);
This opens and reads in the contents of the file given by `filename` and passes it to `mpca_lang`.
Case Study - Line Reader
========================
Case Study - Tokenizer
======================
Another common task we might be interested in doing is parsing a file line by line and doing something on each line we encounter. For this we can setup something like the following:
Another common task we might be interested in doing is tokenizing some block of
text (splitting the text into individual elements) and performing some function
on each one of these elements as it is read. We can do this with `mpc` too.
First, we can build a regular expression which parses a single line: `mpc_re("[^\\n]*(\\n|$)")`, next we can add a callback function using `mpc_apply` which gets called every time a line is parsed successfully `mpc_apply(mpc_re("[^\\n]*(\\n|$)"), read_line)`. Finally we can surround all of this in `mpc_many` to parse zero or more lines. The final thing might look something like this:
First, we can build a regular expression which parses an individual token. For
example, if our tokens are identifiers, integers, commas, periods and colons, we
could build something like this: `mpc_re("\\s*([a-zA-Z_]+|[0-9]+|,|\\.|:)")`.
Next we can strip any whitespace and add a callback function using `mpc_apply`,
which gets called every time this regex is parsed successfully:
`mpc_apply(mpc_strip(mpc_re("\\s*([a-zA-Z_]+|[0-9]+|,|\\.|:)")), print_token)`.
Finally we can surround all of this in `mpc_many` to parse it zero or more
times. The final code might look something like this:
```c
static void* read_line(void* line) {
printf("Reading Line: %s", (char*)line);
return line;
static mpc_val_t *print_token(mpc_val_t *x) {
printf("Token: '%s'\n", (char*)x);
return x;
}
int main(int argc, char **argv) {
const char *input =
"abcHVwufvyuevuy3y436782\n"
"\n"
"\n"
"rehre\n"
"rew\n"
"-ql.;qa\n"
"eg";
const char *input = " hello 4352 , \n foo.bar \n\n test:ing ";
mpc_parser_t* Line = mpc_many(
mpcf_strfold,
mpc_apply(mpc_re("[^\\n]*(\\n|$)"), read_line));
mpc_parser_t* Tokens = mpc_many(
mpcf_all_free,
mpc_apply(mpc_strip(mpc_re("\\s*([a-zA-Z_]+|[0-9]+|,|\\.|:)")), print_token));
mpc_result_t r;
mpc_parse("input", input, Tokens, &r);
mpc_parse("input", input, Line, &r);
printf("\nParsed String: %s", (char*)r.output);
free(r.output);
mpc_delete(Line);
mpc_delete(Tokens);
return 0;
}
```
This program will produce an output something like this:
Running this program will produce an output something like this:
```
Reading Line: abcHVwufvyuevuy3y436782
Reading Line:
Reading Line:
Reading Line: rehre
Reading Line: rew
Reading Line: -ql.;qa
Reading Line: eg
Parsed String: abcHVwufvyuevuy3y436782
rehre
rew
-ql.;qa
eg
Token: 'hello'
Token: '4352'
Token: ','
Token: 'foo'
Token: '.'
Token: 'bar'
Token: 'test'
Token: ':'
Token: 'ing'
```
By extending the regex we can parse many more types of tokens, and quickly
build a tokenizer for whatever language we are interested in.
Error Reporting
===============

79
mpc.c
View File

@@ -1979,7 +1979,13 @@ static int mpc_boundary_anchor(char prev, char next) {
return 0;
}
mpc_parser_t *mpc_boundary(void) { return mpc_expect(mpc_anchor(mpc_boundary_anchor), "boundary"); }
/*
** Anchor predicate for the start of a new line: returns 1 when `prev` is a
** newline character and 0 otherwise. `next` is not consulted.
*/
static int mpc_boundary_newline_anchor(char prev, char next) {
  (void)next;
  if (prev != '\n') { return 0; }
  return 1;
}
mpc_parser_t *mpc_boundary(void) { return mpc_expect(mpc_anchor(mpc_boundary_anchor), "word boundary"); }
/* Anchor parser driven by mpc_boundary_newline_anchor (true when the previous character is '\n'); wrapped in mpc_expect with the label "start of newline". */
mpc_parser_t *mpc_boundary_newline(void) { return mpc_expect(mpc_anchor(mpc_boundary_newline_anchor), "start of newline"); }
mpc_parser_t *mpc_whitespace(void) { return mpc_expect(mpc_oneof(" \f\n\r\t\v"), "whitespace"); }
mpc_parser_t *mpc_whitespaces(void) { return mpc_expect(mpc_many(mpcf_strfold, mpc_whitespace()), "spaces"); }
@@ -2192,19 +2198,44 @@ static mpc_parser_t *mpc_re_escape_char(char c) {
}
}
static mpc_val_t *mpcf_re_escape(mpc_val_t *x) {
static mpc_val_t *mpcf_re_escape(mpc_val_t *x, void* data) {
int mode = *((int*)data);
char *s = x;
mpc_parser_t *p;
/* Regex Special Characters */
if (s[0] == '.') { free(s); return mpc_any(); }
if (s[0] == '^') { free(s); return mpc_and(2, mpcf_snd, mpc_soi(), mpc_lift(mpcf_ctor_str), free); }
/* Any Character */
if (s[0] == '.') {
free(s);
if (mode & MPC_RE_DOTALL) {
return mpc_any();
} else {
return mpc_expect(mpc_noneof("\n"), "any character except a newline");
}
}
/* Start of Input */
if (s[0] == '^') {
free(s);
if (mode & MPC_RE_MULTILINE) {
return mpc_and(2, mpcf_snd, mpc_or(2, mpc_soi(), mpc_boundary_newline()), mpc_lift(mpcf_ctor_str), free);
} else {
return mpc_and(2, mpcf_snd, mpc_soi(), mpc_lift(mpcf_ctor_str), free);
}
}
/* End of Input */
if (s[0] == '$') {
free(s);
return mpc_or(2,
mpc_and(2, mpcf_fst, mpc_newline(), mpc_eoi(), free),
mpc_and(2, mpcf_snd, mpc_eoi(), mpc_lift(mpcf_ctor_str), free));
if (mode & MPC_RE_MULTILINE) {
return mpc_or(2,
mpc_newline(),
mpc_and(2, mpcf_snd, mpc_eoi(), mpc_lift(mpcf_ctor_str), free));
} else {
return mpc_or(2,
mpc_and(2, mpcf_fst, mpc_newline(), mpc_eoi(), free),
mpc_and(2, mpcf_snd, mpc_eoi(), mpc_lift(mpcf_ctor_str), free));
}
}
/* Regex Escape */
@@ -2302,6 +2333,10 @@ static mpc_val_t *mpcf_re_range(mpc_val_t *x) {
}
/* Builds a parser for the regular expression `re` with no mode flags: forwards to mpc_re_mode with MPC_RE_DEFAULT. */
mpc_parser_t *mpc_re(const char *re) {
return mpc_re_mode(re, MPC_RE_DEFAULT);
}
mpc_parser_t *mpc_re_mode(const char *re, int mode) {
char *err_msg;
mpc_parser_t *err_out;
@@ -2334,8 +2369,8 @@ mpc_parser_t *mpc_re(const char *re) {
mpc_define(Base, mpc_or(4,
mpc_parens(Regex, (mpc_dtor_t)mpc_delete),
mpc_squares(Range, (mpc_dtor_t)mpc_delete),
mpc_apply(mpc_escape(), mpcf_re_escape),
mpc_apply(mpc_noneof(")|"), mpcf_re_escape)
mpc_apply_to(mpc_escape(), mpcf_re_escape, &mode),
mpc_apply_to(mpc_noneof(")|"), mpcf_re_escape, &mode)
));
mpc_define(Range, mpc_apply(
@@ -3320,7 +3355,7 @@ mpc_parser_t *mpca_total(mpc_parser_t *a) { return mpc_total(a, (mpc_dtor_t)mpc_
** <base> : "<" (<digits> | <ident>) ">"
** | <string_lit>
** | <char_lit>
** | <regex_lit>
** | <regex_lit> <regex_mode>
** | "(" <grammar> ")"
*/
@@ -3379,11 +3414,21 @@ static mpc_val_t *mpcaf_grammar_char(mpc_val_t *x, void *s) {
return mpca_state(mpca_tag(mpc_apply(p, mpcf_str_ast), "char"));
}
static mpc_val_t *mpcaf_grammar_regex(mpc_val_t *x, void *s) {
mpca_grammar_st_t *st = s;
char *y = mpcf_unescape_regex(x);
mpc_parser_t *p = (st->flags & MPCA_LANG_WHITESPACE_SENSITIVE) ? mpc_re(y) : mpc_tok(mpc_re(y));
/*
** Fold for a regex literal followed by its optional mode letters.
**
** xs[0] : regex literal text (passed through mpcf_unescape_regex)
** xs[1] : string of mode letters; 'm' selects MPC_RE_MULTILINE and
**         's' selects MPC_RE_DOTALL
** xs[2] : grammar state, whose flags decide whether the regex parser is
**         wrapped in mpc_tok
**
** Both strings are freed here. The resulting parser is tagged "regex".
*/
static mpc_val_t *mpcaf_fold_regex(int n, mpc_val_t **xs) {
  mpca_grammar_st_t *state = xs[2];
  char *letters = xs[1];
  char *pattern;
  mpc_parser_t *parser;
  int mode = MPC_RE_DEFAULT;
  (void)n;

  /* Translate the mode letters into flag bits, then drop the string. */
  if (strchr(letters, 's') != NULL) { mode |= MPC_RE_DOTALL; }
  if (strchr(letters, 'm') != NULL) { mode |= MPC_RE_MULTILINE; }
  free(letters);

  pattern = mpcf_unescape_regex(xs[0]);
  parser = mpc_re_mode(pattern, mode);
  free(pattern);

  /* Unless the grammar is whitespace sensitive, treat the regex as a token. */
  if (!(state->flags & MPCA_LANG_WHITESPACE_SENSITIVE)) {
    parser = mpc_tok(parser);
  }

  return mpca_state(mpca_tag(mpc_apply(parser, mpcf_str_ast), "regex"));
}
@@ -3496,7 +3541,7 @@ mpc_parser_t *mpca_grammar_st(const char *grammar, mpca_grammar_st_t *st) {
mpc_define(Base, mpc_or(5,
mpc_apply_to(mpc_tok(mpc_string_lit()), mpcaf_grammar_string, st),
mpc_apply_to(mpc_tok(mpc_char_lit()), mpcaf_grammar_char, st),
mpc_apply_to(mpc_tok(mpc_regex_lit()), mpcaf_grammar_regex, st),
mpc_tok(mpc_and(3, mpcaf_fold_regex, mpc_regex_lit(), mpc_many(mpcf_strfold, mpc_oneof("ms")), mpc_lift_val(st), free, free)),
mpc_apply_to(mpc_tok_braces(mpc_or(2, mpc_digits(), mpc_ident()), free), mpcaf_grammar_id, st),
mpc_tok_parens(Grammar, mpc_soft_delete)
));
@@ -3658,7 +3703,7 @@ static mpc_err_t *mpca_lang_st(mpc_input_t *i, mpca_grammar_st_t *st) {
mpc_define(Base, mpc_or(5,
mpc_apply_to(mpc_tok(mpc_string_lit()), mpcaf_grammar_string, st),
mpc_apply_to(mpc_tok(mpc_char_lit()), mpcaf_grammar_char, st),
mpc_apply_to(mpc_tok(mpc_regex_lit()), mpcaf_grammar_regex, st),
mpc_tok(mpc_and(3, mpcaf_fold_regex, mpc_regex_lit(), mpc_many(mpcf_strfold, mpc_oneof("ms")), mpc_lift_val(st), free, free)),
mpc_apply_to(mpc_tok_braces(mpc_or(2, mpc_digits(), mpc_ident()), free), mpcaf_grammar_id, st),
mpc_tok_parens(Grammar, mpc_soft_delete)
));

10
mpc.h
View File

@@ -156,6 +156,7 @@ mpc_parser_t *mpc_eoi(void);
mpc_parser_t *mpc_soi(void);
mpc_parser_t *mpc_boundary(void);
mpc_parser_t *mpc_boundary_newline(void);
mpc_parser_t *mpc_whitespace(void);
mpc_parser_t *mpc_whitespaces(void);
@@ -264,7 +265,16 @@ mpc_val_t *mpcf_maths(int n, mpc_val_t** xs);
** Regular Expression Parsers
*/
/*
** Mode flags accepted by mpc_re_mode. The non-zero values are bit flags that
** can be OR-ed together; each flag has a long and a short name with the same
** value.
*/
enum {
MPC_RE_DEFAULT = 0, /* no mode flags */
MPC_RE_M = 1, /* short name for MPC_RE_MULTILINE */
MPC_RE_S = 2, /* short name for MPC_RE_DOTALL */
MPC_RE_MULTILINE = 1, /* `^` also matches at the start of new lines, `$` also matches newlines */
MPC_RE_DOTALL = 2 /* `.` also matches newline characters */
};
mpc_parser_t *mpc_re(const char *re);
mpc_parser_t *mpc_re_mode(const char *re, int mode);
/*
** AST

View File

@@ -1,6 +1,6 @@
{
"name": "mpc",
"version": "0.8.8",
"version": "0.9.8",
"repo": "orangeduck/mpc",
"description": "A Parser Combinator library for C",
"keywords": ["parser", "combinator", "library", "c", "mpc"],

View File

@@ -154,7 +154,7 @@ void test_copy(void) {
static int line_count = 0;
static void* read_line(void* line) {
static mpc_val_t* read_line(mpc_val_t* line) {
line_count++;
return line;
}
@@ -185,6 +185,32 @@ void test_reader(void) {
}
static int token_count = 0;
/* Apply callback: prints the value as a string, increments token_count and returns the value unchanged. */
static mpc_val_t *print_token(mpc_val_t *x) {
printf("Token: '%s'\n", (char*)x);
token_count++;
return x;
}
/*
** Tokenizer test: repeatedly applies a whitespace-stripped token regex,
** calling print_token on each match and folding the matches with
** mpcf_strfold.
*/
void test_tokens(void) {
mpc_parser_t* Tokens = mpc_many(
mpcf_strfold,
mpc_apply(mpc_strip(mpc_re("\\s*([a-zA-Z_]+|[0-9]+|,|\\.|:)")), print_token));
/* Reset the counter so the count asserted below reflects this run only. */
token_count = 0;
PT_ASSERT(mpc_test_pass(Tokens,
" hello 4352 , \n foo.bar \n\n test:ing ",
"hello4352,foo.bartest:ing", streq, free, strprint));
/* hello, 4352, ',', foo, '.', bar, test, ':', ing */
PT_ASSERT(token_count == 9);
mpc_delete(Tokens);
}
void test_eoi(void) {
mpc_parser_t* Line = mpc_re("[^\\n]*$");
@@ -203,5 +229,6 @@ void suite_core(void) {
pt_add_test(test_repeat, "Test Repeat", "Suite Core");
pt_add_test(test_copy, "Test Copy", "Suite Core");
pt_add_test(test_reader, "Test Reader", "Suite Core");
pt_add_test(test_tokens, "Test Tokens", "Suite Core");
pt_add_test(test_eoi, "Test EOI", "Suite Core");
}

View File

@@ -163,6 +163,8 @@ void test_partial(void) {
(mpc_dtor_t)mpc_ast_delete,
(void(*)(const void*))mpc_ast_print));
mpc_ast_delete(t0);
mpc_cleanup(5, Line, Number, QuotedString, LinePragma, Parser);
}
@@ -248,6 +250,8 @@ void test_qscript(void) {
(mpc_dtor_t)mpc_ast_delete,
(void(*)(const void*))mpc_ast_print));
mpc_ast_delete(t0);
mpc_cleanup(18, Qscript, Comment, Resource, Rtype, Rname, InnerBlock,
Statement, Function, Parameter, Literal, Block, Seperator, Qstring,
SimpleStr, ComplexStr, Number, Float, Int);
@@ -278,6 +282,61 @@ void test_missingrule(void) {
}
/*
** Checks the regex mode suffixes in the grammar language: the same pattern
** is defined with and without a trailing `s` / `m` letter and each rule is
** compared against an expected "regex" AST.
*/
void test_regex_mode(void) {
mpc_parser_t *Line0, *Line1, *Line2, *Line3;
mpc_ast_t *t0, *t1, *t2, *t3, *t4;
Line0 = mpc_new("line0");
Line1 = mpc_new("line1");
Line2 = mpc_new("line2");
Line3 = mpc_new("line3");
/* line0/line1 differ only by the `s` suffix; line2/line3 only by the `m` suffix. */
mpca_lang(MPCA_LANG_DEFAULT, " line0 : /.*/; ", Line0);
mpca_lang(MPCA_LANG_DEFAULT, " line1 : /.*/s; ", Line1);
mpca_lang(MPCA_LANG_DEFAULT, " line2 : /(^[a-z]*$)*/; ", Line2);
mpca_lang(MPCA_LANG_DEFAULT, " line3 : /(^[a-z]*$)*/m; ", Line3);
/* Expected ASTs, one per assertion below. */
t0 = mpc_ast_new("regex", "blah");
t1 = mpc_ast_new("regex", "blah\nblah");
t2 = mpc_ast_new("regex", "");
t3 = mpc_ast_new("regex", "blah");
t4 = mpc_ast_new("regex", "blah\nblah");
/* No suffix: expected contents stop before the newline. */
PT_ASSERT(mpc_test_pass(Line0, "blah\nblah", t0,
(int(*)(const void*,const void*))mpc_ast_eq,
(mpc_dtor_t)mpc_ast_delete,
(void(*)(const void*))mpc_ast_print));
/* `s` suffix: expected contents include the newline and the second line. */
PT_ASSERT(mpc_test_pass(Line1, "blah\nblah", t1,
(int(*)(const void*,const void*))mpc_ast_eq,
(mpc_dtor_t)mpc_ast_delete,
(void(*)(const void*))mpc_ast_print));
/* No suffix on the anchored pattern: two-line input is expected to yield empty contents. */
PT_ASSERT(mpc_test_pass(Line2, "blah\nblah", t2,
(int(*)(const void*,const void*))mpc_ast_eq,
(mpc_dtor_t)mpc_ast_delete,
(void(*)(const void*))mpc_ast_print));
PT_ASSERT(mpc_test_pass(Line2, "blah", t3,
(int(*)(const void*,const void*))mpc_ast_eq,
(mpc_dtor_t)mpc_ast_delete,
(void(*)(const void*))mpc_ast_print));
/* `m` suffix: the same two-line input is expected to yield both lines. */
PT_ASSERT(mpc_test_pass(Line3, "blah\nblah", t4,
(int(*)(const void*,const void*))mpc_ast_eq,
(mpc_dtor_t)mpc_ast_delete,
(void(*)(const void*))mpc_ast_print));
mpc_ast_delete(t0);
mpc_ast_delete(t1);
mpc_ast_delete(t2);
mpc_ast_delete(t3);
mpc_ast_delete(t4);
mpc_cleanup(4, Line0, Line1, Line2, Line3);
}
void suite_grammar(void) {
pt_add_test(test_grammar, "Test Grammar", "Suite Grammar");
pt_add_test(test_language, "Test Language", "Suite Grammar");
@@ -286,4 +345,5 @@ void suite_grammar(void) {
pt_add_test(test_partial, "Test Partial", "Suite Grammar");
pt_add_test(test_qscript, "Test QScript", "Suite Grammar");
pt_add_test(test_missingrule, "Test Missing Rule", "Suite Grammar");
pt_add_test(test_regex_mode, "Test Regex Mode", "Suite Grammar");
}

View File

@@ -132,6 +132,43 @@ void test_regex_newline(void) {
}
/* Exercises the pattern `(^[a-z]*$)*` built with MPC_RE_MULTILINE against single- and multi-line inputs. */
void test_regex_multiline(void) {
mpc_parser_t *re0 = mpc_re_mode("(^[a-z]*$)*", MPC_RE_MULTILINE);
PT_ASSERT(regex_test_pass(re0, "hello\nhello", "hello\nhello"));
PT_ASSERT(regex_test_pass(re0, "hello\nhello\n", "hello\nhello\n"));
PT_ASSERT(regex_test_pass(re0, "\nblah\n\nblah\n", "\nblah\n\nblah\n"));
/* Digits are outside [a-z]: the full input is not the expected result. */
PT_ASSERT(regex_test_fail(re0, "45234", "45234"));
PT_ASSERT(regex_test_fail(re0, "\n45234", "\n45234"));
/* Only the leading newline is expected for this input. */
PT_ASSERT(regex_test_pass(re0, "\n45234", "\n"));
mpc_delete(re0);
}
/* Compares the pattern `^.*$` built with MPC_RE_DEFAULT (re0) and with MPC_RE_DOTALL (re1) on inputs with and without newlines. */
void test_regex_dotall(void) {
mpc_parser_t *re0 = mpc_re_mode("^.*$", MPC_RE_DEFAULT);
mpc_parser_t *re1 = mpc_re_mode("^.*$", MPC_RE_DOTALL);
/* Default mode: inputs without a newline pass; the newline-containing cases use regex_test_fail. */
PT_ASSERT(regex_test_pass(re0, "hello", "hello"));
PT_ASSERT(regex_test_fail(re0, "hello\n", "hello"));
PT_ASSERT(regex_test_fail(re0, "he\nllo\n", "he"));
PT_ASSERT(regex_test_pass(re0, "34njaksdklmasd", "34njaksdklmasd"));
PT_ASSERT(regex_test_fail(re0, "34njaksd\nklmasd", "34njaksd"));
/* DOTALL mode: every input, newlines included, is expected back in full. */
PT_ASSERT(regex_test_pass(re1, "hello", "hello"));
PT_ASSERT(regex_test_pass(re1, "hello\n", "hello\n"));
PT_ASSERT(regex_test_pass(re1, "he\nllo\n", "he\nllo\n"));
PT_ASSERT(regex_test_pass(re1, "34njaksdklmasd", "34njaksdklmasd"));
PT_ASSERT(regex_test_pass(re1, "34njaksd\nklmasd", "34njaksd\nklmasd"));
mpc_delete(re0);
mpc_delete(re1);
}
void suite_regex(void) {
pt_add_test(test_regex_basic, "Test Regex Basic", "Suite Regex");
pt_add_test(test_regex_range, "Test Regex Range", "Suite Regex");
@@ -139,4 +176,6 @@ void suite_regex(void) {
pt_add_test(test_regex_lisp_comment, "Test Regex Lisp Comment", "Suite Regex");
pt_add_test(test_regex_boundary, "Test Regex Boundary", "Suite Regex");
pt_add_test(test_regex_newline, "Test Regex Newline", "Suite Regex");
pt_add_test(test_regex_multiline, "Test Regex Multiline", "Suite Regex");
pt_add_test(test_regex_dotall, "Test Regex Dotall", "Suite Regex");
}