Added mode option to regex and also changed example from a line reader to a tokenizer.

2018-10-14 17:20:11 -04:00
parent 95439eb9c8
commit 4a992d91ab
7 changed files with 254 additions and 59 deletions
--- a/README.md
+++ b/README.md
@@ -560,6 +560,20 @@ This function makes a copy of a parser `a`. This can be useful when you want to
 use a parser as input for some other parsers multiple times without retaining 
 it. 
 * * *
 ```c
 mpc_parser_t *mpc_re(const char *re);
 mpc_parser_t *mpc_re_mode(const char *re, int mode);
 ```
 `mpc_re` takes as input the regular expression `re` and builds a parser 
 for it. With `mpc_re_mode`, optional mode flags can also be given. 
 The available flags are `MPC_RE_MULTILINE` / `MPC_RE_M`, where the start of 
 input character `^` also matches the beginning of new lines and the end of 
 input character `$` also matches new lines, and `MPC_RE_DOTALL` / `MPC_RE_S`, 
 where the any character token `.` also matches newlines (by default it doesn't).
 Library Reference
 =================
@@ -573,6 +587,7 @@ Common Parsers
  <tr><td><code>mpc_soi</code></td><td>Matches only the start of input, returns <code>NULL</code></td></tr>
  <tr><td><code>mpc_eoi</code></td><td>Matches only the end of input, returns <code>NULL</code></td></tr>
  <tr><td><code>mpc_boundary</code></td><td>Matches only the boundary between words, returns <code>NULL</code></td></tr>
  <tr><td><code>mpc_boundary_newline</code></td><td>Matches the start of a new line, returns <code>NULL</code></td></tr>
  <tr><td><code>mpc_whitespace</code></td><td>Matches any whitespace character <code>" \f\n\r\t\v"</code></td></tr>
  <tr><td><code>mpc_whitespaces</code></td><td>Matches zero or more whitespace characters</td></tr>
  <tr><td><code>mpc_blank</code></td><td>Matches whitespaces and frees the result, returns <code>NULL</code></td></tr>
@@ -807,65 +822,64 @@ mpc_err_t *mpca_lang_contents(int flags, const char *filename, ...);
 This opens and reads in the contents of the file given by `filename` and passes it to `mpca_lang`.
-Case Study - Line Reader
+Case Study - Tokenizer
-========================
+======================
-Another common task we might be interested in doing is parsing a file line by line and doing something on each line we encounter. For this we can setup something like the following:
+Another common task we might be interested in doing is tokenizing some block of 
 text (splitting the text into individual elements) and performing some function
 on each one of these elements as it is read. We can do this with `mpc` too.
-First, we can build a regular expression which parses a single line: `mpc_re("[^\\n]*(\\n|$)")`, next we can add a callback function using `mpc_apply` which gets called every time a line is parsed successfully `mpc_apply(mpc_re("[^\\n]*(\\n|$)"), read_line)`. Finally we can surround all of this in `mpc_many` to parse zero or more lines. The final thing might look something like this:
+First, we can build a regular expression which parses an individual token. For 
 example if our tokens are identifiers, integers, commas, periods and colons we 
 could build something like this `mpc_re("\\s*([a-zA-Z_]+|[0-9]+|,|\\.|:)")`. 
 Next we can strip any whitespace, and add a callback function using `mpc_apply` 
 which gets called every time this regex is parsed successfully 
 `mpc_apply(mpc_strip(mpc_re("\\s*([a-zA-Z_]+|[0-9]+|,|\\.|:)")), print_token)`. 
 Finally we can surround all of this in `mpc_many` to parse it zero or more 
 times. The final code might look something like this:
 ```c
-static void* read_line(void* line) {
+static mpc_val_t *print_token(mpc_val_t *x) {
-  printf("Reading Line: %s", (char*)line);
+  printf("Token: '%s'\n", (char*)x);
-  return line;
+  return x;
 }
 int main(int argc, char **argv) {
-  const char *input = 
+  const char *input = "  hello 4352 ,  \n foo.bar   \n\n  test:ing   ";
-    "abcHVwufvyuevuy3y436782\n"
-    "\n"
-    "\n"
-    "rehre\n"
-    "rew\n"
-    "-ql.;qa\n"
-    "eg";
-  mpc_parser_t* Line = mpc_many(
+  mpc_parser_t* Tokens = mpc_many(
-    mpcf_strfold, 
+    mpcf_all_free, 
-    mpc_apply(mpc_re("[^\\n]*(\\n|$)"), read_line));
+    mpc_apply(mpc_strip(mpc_re("\\s*([a-zA-Z_]+|[0-9]+|,|\\.|:)")), print_token));
  mpc_result_t r;
  mpc_parse("input", input, Tokens, &r);
-  mpc_parse("input", input, Line, &r);
+  mpc_delete(Tokens);
-  printf("\nParsed String: %s", (char*)r.output);
-  free(r.output);
-  mpc_delete(Line);
  return 0;
 }
 ```
-This program will produce an output something like this:
+Running this program will produce an output something like this:
 ```
-Reading Line: abcHVwufvyuevuy3y436782
+Token: 'hello'
-Reading Line:
+Token: '4352'
-Reading Line:
+Token: ','
-Reading Line: rehre
+Token: 'foo'
-Reading Line: rew
+Token: '.'
-Reading Line: -ql.;qa
+Token: 'bar'
-Reading Line: eg
+Token: 'test'
-Parsed String: abcHVwufvyuevuy3y436782
+Token: ':'
-
+Token: 'ing'
-rehre
-rew
--ql.;qa
-eg
 ```
 By extending the regex we can easily extend this to parse many more types of 
 tokens and quickly and easily build a tokenizer for whatever language we are
 interested in.
 Error Reporting
 ===============
--- a/mpc.c
+++ b/mpc.c
@@ -1979,7 +1979,13 @@ static int mpc_boundary_anchor(char prev, char next) {
  return 0;
 }
-mpc_parser_t *mpc_boundary(void) { return mpc_expect(mpc_anchor(mpc_boundary_anchor), "boundary"); }
+static int mpc_boundary_newline_anchor(char prev, char next) {
  /* Anchor predicate: non-zero when the character before the current */
  /* position is a newline. `next` is deliberately ignored. */
  (void)next;
  return prev == '\n'; 
 }
 /* Anchor parser built on mpc_boundary_anchor (boundary between words). */
 /* The string given to mpc_expect is presumably the description shown in error reports - confirm. */
 mpc_parser_t *mpc_boundary(void) { return mpc_expect(mpc_anchor(mpc_boundary_anchor), "word boundary"); }
 /* Anchor parser built on mpc_boundary_newline_anchor, i.e. it holds when the previous character is '\n'. */
 mpc_parser_t *mpc_boundary_newline(void) { return mpc_expect(mpc_anchor(mpc_boundary_newline_anchor), "start of newline"); }
 /* Parser for a single character from the set " \f\n\r\t\v". */
 mpc_parser_t *mpc_whitespace(void) { return mpc_expect(mpc_oneof(" \f\n\r\t\v"), "whitespace"); }
 /* Parser for zero or more whitespace characters, folded together with mpcf_strfold. */
 mpc_parser_t *mpc_whitespaces(void) { return mpc_expect(mpc_many(mpcf_strfold, mpc_whitespace()), "spaces"); }
@@ -2192,20 +2198,45 @@ static mpc_parser_t *mpc_re_escape_char(char c) {
  }
 }
-static mpc_val_t *mpcf_re_escape(mpc_val_t *x) {
+static mpc_val_t *mpcf_re_escape(mpc_val_t *x, void* data) {
  int mode = *((int*)data);
  char *s = x;
  mpc_parser_t *p;
-  /* Regex Special Characters */
+  /* Any Character */
-  if (s[0] == '.') { free(s); return mpc_any(); }
+  if (s[0] == '.') {
-  if (s[0] == '^') { free(s); return mpc_and(2, mpcf_snd, mpc_soi(), mpc_lift(mpcf_ctor_str), free); }
+    free(s);
    if (mode & MPC_RE_DOTALL) {
      return mpc_any();      
    } else {
      return mpc_expect(mpc_noneof("\n"), "any character except a newline");
    }
  }
  /* Start of Input */
  if (s[0] == '^') {
    free(s);
    if (mode & MPC_RE_MULTILINE) {
      return mpc_and(2, mpcf_snd, mpc_or(2, mpc_soi(), mpc_boundary_newline()), mpc_lift(mpcf_ctor_str), free);
    } else {
      return mpc_and(2, mpcf_snd, mpc_soi(), mpc_lift(mpcf_ctor_str), free);
    }
  }
  /* End of Input */
  if (s[0] == '$') {
    free(s); 
    if (mode & MPC_RE_MULTILINE) {
      return mpc_or(2, 
        mpc_newline(), 
        mpc_and(2, mpcf_snd, mpc_eoi(), mpc_lift(mpcf_ctor_str), free)); 
    } else {
      return mpc_or(2, 
        mpc_and(2, mpcf_fst, mpc_newline(), mpc_eoi(), free), 
        mpc_and(2, mpcf_snd, mpc_eoi(), mpc_lift(mpcf_ctor_str), free));      
    }
  }
  /* Regex Escape */
  if (s[0] == '\\') {
@@ -2302,6 +2333,10 @@ static mpc_val_t *mpcf_re_range(mpc_val_t *x) {
 }
 /* Builds a regex parser with no mode flags set: forwards `re` to */
 /* mpc_re_mode together with MPC_RE_DEFAULT. */
 mpc_parser_t *mpc_re(const char *re) {
  return mpc_re_mode(re, MPC_RE_DEFAULT);
 }
 mpc_parser_t *mpc_re_mode(const char *re, int mode) {
  char *err_msg;
  mpc_parser_t *err_out;
@@ -2334,8 +2369,8 @@ mpc_parser_t *mpc_re(const char *re) {
  mpc_define(Base, mpc_or(4,
    mpc_parens(Regex, (mpc_dtor_t)mpc_delete),
    mpc_squares(Range, (mpc_dtor_t)mpc_delete),
-    mpc_apply(mpc_escape(), mpcf_re_escape),
+    mpc_apply_to(mpc_escape(), mpcf_re_escape, &mode),
-    mpc_apply(mpc_noneof(")|"), mpcf_re_escape)
+    mpc_apply_to(mpc_noneof(")|"), mpcf_re_escape, &mode)
  ));
  mpc_define(Range, mpc_apply(
@@ -3320,7 +3355,7 @@ mpc_parser_t *mpca_total(mpc_parser_t *a) { return mpc_total(a, (mpc_dtor_t)mpc_
 **      <base> : "<" (<digits> | <ident>) ">"
 **             | <string_lit>
 **             | <char_lit>
-**             | <regex_lit>
+**             | <regex_lit> <regex_mode>
 **             | "(" <grammar> ")"
 */
@@ -3379,11 +3414,21 @@ static mpc_val_t *mpcaf_grammar_char(mpc_val_t *x, void *s) {
  return mpca_state(mpca_tag(mpc_apply(p, mpcf_str_ast), "char"));
 }
-static mpc_val_t *mpcaf_grammar_regex(mpc_val_t *x, void *s) {
+static mpc_val_t *mpcaf_fold_regex(int n, mpc_val_t **xs) {
  /* Fold for the three-part regex literal sequence in the grammar parser: */
  /* xs[0] is the regex literal text, xs[1] the string of mode characters */
  /* ('m' / 's') that followed it, xs[2] the grammar state. */
-  mpca_grammar_st_t *st = s;
+  char *y = xs[0];
-  char *y = mpcf_unescape_regex(x);
+  char *m = xs[1];
-  mpc_parser_t *p = (st->flags & MPCA_LANG_WHITESPACE_SENSITIVE) ? mpc_re(y) : mpc_tok(mpc_re(y));
+  mpca_grammar_st_t *st = xs[2];
  mpc_parser_t *p;
  int mode = MPC_RE_DEFAULT;
  (void)n;
  /* Translate the mode characters into MPC_RE_* flags. */
  if (strchr(m, 'm')) { mode |= MPC_RE_MULTILINE; }
  if (strchr(m, 's')) { mode |= MPC_RE_DOTALL; }
  y = mpcf_unescape_regex(y);
  /* Unless the grammar is whitespace sensitive, wrap the regex parser in mpc_tok. */
  p = (st->flags & MPCA_LANG_WHITESPACE_SENSITIVE) ? mpc_re_mode(y, mode) : mpc_tok(mpc_re_mode(y, mode));
  /* The pattern and mode strings are released here; only the parser is returned. */
  free(y);
  free(m);
  return mpca_state(mpca_tag(mpc_apply(p, mpcf_str_ast), "regex"));
 }
@@ -3496,7 +3541,7 @@ mpc_parser_t *mpca_grammar_st(const char *grammar, mpca_grammar_st_t *st) {
  mpc_define(Base, mpc_or(5,
    mpc_apply_to(mpc_tok(mpc_string_lit()), mpcaf_grammar_string, st),
    mpc_apply_to(mpc_tok(mpc_char_lit()),   mpcaf_grammar_char, st),
-    mpc_apply_to(mpc_tok(mpc_regex_lit()),  mpcaf_grammar_regex, st),
+    mpc_tok(mpc_and(3, mpcaf_fold_regex, mpc_regex_lit(), mpc_many(mpcf_strfold, mpc_oneof("ms")), mpc_lift_val(st), free, free)),
    mpc_apply_to(mpc_tok_braces(mpc_or(2, mpc_digits(), mpc_ident()), free), mpcaf_grammar_id, st),
    mpc_tok_parens(Grammar, mpc_soft_delete)
  ));
@@ -3658,7 +3703,7 @@ static mpc_err_t *mpca_lang_st(mpc_input_t *i, mpca_grammar_st_t *st) {
  mpc_define(Base, mpc_or(5,
    mpc_apply_to(mpc_tok(mpc_string_lit()), mpcaf_grammar_string, st),
    mpc_apply_to(mpc_tok(mpc_char_lit()),   mpcaf_grammar_char, st),
-    mpc_apply_to(mpc_tok(mpc_regex_lit()),  mpcaf_grammar_regex, st),
+    mpc_tok(mpc_and(3, mpcaf_fold_regex, mpc_regex_lit(), mpc_many(mpcf_strfold, mpc_oneof("ms")), mpc_lift_val(st), free, free)),
    mpc_apply_to(mpc_tok_braces(mpc_or(2, mpc_digits(), mpc_ident()), free), mpcaf_grammar_id, st),
    mpc_tok_parens(Grammar, mpc_soft_delete)
  ));
--- a/mpc.h
+++ b/mpc.h
@@ -156,6 +156,7 @@ mpc_parser_t *mpc_eoi(void);
 mpc_parser_t *mpc_soi(void);
 mpc_parser_t *mpc_boundary(void);
 mpc_parser_t *mpc_boundary_newline(void);
 mpc_parser_t *mpc_whitespace(void);
 mpc_parser_t *mpc_whitespaces(void);
@@ -264,7 +265,16 @@ mpc_val_t *mpcf_maths(int n, mpc_val_t** xs);
 ** Regular Expression Parsers
 */
 /* Mode flags for mpc_re_mode. They are used as bit flags (combined with */
 /* '|' and tested with '&'). MPC_RE_M and MPC_RE_S are short aliases that */
 /* share the values of MPC_RE_MULTILINE and MPC_RE_DOTALL respectively. */
 enum {
  MPC_RE_DEFAULT   = 0,
  MPC_RE_M         = 1,
  MPC_RE_S         = 2,
  MPC_RE_MULTILINE = 1,
  MPC_RE_DOTALL    = 2
 };
 /* Builds a parser from the regular expression `re`. */
 mpc_parser_t *mpc_re(const char *re);
 /* As mpc_re, with `mode` holding a combination of the MPC_RE_* flags. */
 mpc_parser_t *mpc_re_mode(const char *re, int mode);
 /*
 ** AST
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
  "name": "mpc",
-  "version": "0.8.8",
+  "version": "0.9.8",
  "repo": "orangeduck/mpc",
  "description": "A Parser Combinator library for C",
  "keywords": ["parser", "combinator", "library", "c", "mpc"],
--- a/tests/core.c
+++ b/tests/core.c
@@ -154,7 +154,7 @@ void test_copy(void) {
 /* Number of times read_line has been called. */
 static int line_count = 0;
 /* Callback that increments line_count and returns its argument unchanged. */
-static void* read_line(void* line) {
+static mpc_val_t* read_line(mpc_val_t* line) {
  line_count++;
  return line;
 }
@@ -185,6 +185,32 @@ void test_reader(void) {
 }
 /* Number of times print_token has been called; reset by test_tokens. */
 static int token_count = 0;
 /* Apply callback: prints the value as a string, counts it in token_count, */
 /* and returns it unchanged. */
 static mpc_val_t *print_token(mpc_val_t *x) {
  printf("Token: '%s'\n", (char*)x);
  token_count++;
  return x;
 }
 /* Tokenizer test: repeatedly applies a regex for a single token */
 /* (identifier, integer, comma, period or colon, with optional leading */
 /* whitespace), passing each match through print_token. */
 void test_tokens(void) {
  mpc_parser_t* Tokens = mpc_many(
    mpcf_strfold, 
    mpc_apply(mpc_strip(mpc_re("\\s*([a-zA-Z_]+|[0-9]+|,|\\.|:)")), print_token));
  token_count = 0;
  /* The folded result is expected to be the tokens joined with no whitespace. */
  PT_ASSERT(mpc_test_pass(Tokens, 
    "  hello 4352 ,  \n foo.bar   \n\n  test:ing   ", 
    "hello4352,foo.bartest:ing", streq, free, strprint));
  /* The input holds nine tokens, so the callback must have run nine times. */
  PT_ASSERT(token_count == 9);
  mpc_delete(Tokens);
 }
 void test_eoi(void) {
  mpc_parser_t* Line = mpc_re("[^\\n]*$");
@@ -203,5 +229,6 @@ void suite_core(void) {
  pt_add_test(test_repeat, "Test Repeat", "Suite Core");
  pt_add_test(test_copy,   "Test Copy",   "Suite Core");
  pt_add_test(test_reader, "Test Reader", "Suite Core");
  pt_add_test(test_tokens, "Test Tokens", "Suite Core");
  pt_add_test(test_eoi,    "Test EOI",    "Suite Core");
 }
--- a/tests/grammar.c
+++ b/tests/grammar.c
@@ -163,6 +163,8 @@ void test_partial(void) {
    (mpc_dtor_t)mpc_ast_delete, 
    (void(*)(const void*))mpc_ast_print));
  mpc_ast_delete(t0);
  mpc_cleanup(5, Line, Number, QuotedString, LinePragma, Parser);
 }
@@ -248,6 +250,8 @@ void test_qscript(void) {
    (mpc_dtor_t)mpc_ast_delete,
    (void(*)(const void*))mpc_ast_print));
  mpc_ast_delete(t0);
  mpc_cleanup(18, Qscript, Comment, Resource, Rtype, Rname, InnerBlock,
  Statement, Function, Parameter, Literal, Block, Seperator, Qstring,
  SimpleStr, ComplexStr, Number, Float, Int);
@@ -278,6 +282,61 @@ void test_missingrule(void) {
 }
 /* Checks the mode suffix on regex literals in the grammar language: */
 /* no suffix, 's' (dot matches newline) and 'm' (multiline anchors). */
 void test_regex_mode(void) {
  mpc_parser_t *Line0, *Line1, *Line2, *Line3;
  mpc_ast_t *t0, *t1, *t2, *t3, *t4;
  Line0 = mpc_new("line0");
  Line1 = mpc_new("line1");
  Line2 = mpc_new("line2");
  Line3 = mpc_new("line3");
  /* line0 / line1: the same dot-star pattern, without and with the 's' suffix. */
  /* line2 / line3: the same anchored pattern, without and with the 'm' suffix. */
  mpca_lang(MPCA_LANG_DEFAULT, " line0 : /.*/; ", Line0);
  mpca_lang(MPCA_LANG_DEFAULT, " line1 : /.*/s; ", Line1);
  mpca_lang(MPCA_LANG_DEFAULT, " line2 : /(^[a-z]*$)*/; ", Line2);
  mpca_lang(MPCA_LANG_DEFAULT, " line3 : /(^[a-z]*$)*/m; ", Line3);
  /* Expected ASTs, one per assertion below (t0..t4 in order). */
  t0 = mpc_ast_new("regex", "blah");
  t1 = mpc_ast_new("regex", "blah\nblah");
  t2 = mpc_ast_new("regex", "");
  t3 = mpc_ast_new("regex", "blah");
  t4 = mpc_ast_new("regex", "blah\nblah");
  /* Default mode: the match is expected to stop at the newline. */
  PT_ASSERT(mpc_test_pass(Line0, "blah\nblah", t0,
    (int(*)(const void*,const void*))mpc_ast_eq,
    (mpc_dtor_t)mpc_ast_delete,
    (void(*)(const void*))mpc_ast_print));
  /* 's' suffix: the match is expected to run across the newline. */
  PT_ASSERT(mpc_test_pass(Line1, "blah\nblah", t1,
    (int(*)(const void*,const void*))mpc_ast_eq,
    (mpc_dtor_t)mpc_ast_delete,
    (void(*)(const void*))mpc_ast_print));
  /* No 'm' suffix: two-line input is expected to yield an empty match... */
  PT_ASSERT(mpc_test_pass(Line2, "blah\nblah", t2,
    (int(*)(const void*,const void*))mpc_ast_eq,
    (mpc_dtor_t)mpc_ast_delete,
    (void(*)(const void*))mpc_ast_print));
  /* ...while single-line input is expected to match in full. */
  PT_ASSERT(mpc_test_pass(Line2, "blah", t3,
    (int(*)(const void*,const void*))mpc_ast_eq,
    (mpc_dtor_t)mpc_ast_delete,
    (void(*)(const void*))mpc_ast_print));
  /* 'm' suffix: both lines are expected to be matched. */
  PT_ASSERT(mpc_test_pass(Line3, "blah\nblah", t4,
    (int(*)(const void*,const void*))mpc_ast_eq,
    (mpc_dtor_t)mpc_ast_delete,
    (void(*)(const void*))mpc_ast_print));
  mpc_ast_delete(t0);
  mpc_ast_delete(t1);
  mpc_ast_delete(t2);
  mpc_ast_delete(t3);
  mpc_ast_delete(t4);
  mpc_cleanup(4, Line0, Line1, Line2, Line3);
 }
 void suite_grammar(void) {
  pt_add_test(test_grammar, "Test Grammar", "Suite Grammar");
  pt_add_test(test_language, "Test Language", "Suite Grammar");
@@ -286,4 +345,5 @@ void suite_grammar(void) {
  pt_add_test(test_partial, "Test Partial", "Suite Grammar");
  pt_add_test(test_qscript, "Test QScript", "Suite Grammar");
  pt_add_test(test_missingrule, "Test Missing Rule", "Suite Grammar");
  pt_add_test(test_regex_mode, "Test Regex Mode", "Suite Grammar");
 }
--- a/tests/regex.c
+++ b/tests/regex.c
@@ -132,6 +132,43 @@ void test_regex_newline(void) {
 }
 /* Exercises MPC_RE_MULTILINE on a pattern of repeated anchored lines of */
 /* lowercase letters. */
 /* NOTE(review): regex_test_pass / regex_test_fail are defined outside this */
 /* hunk; presumably they parse the second argument and compare the result */
 /* with the third - confirm. */
 void test_regex_multiline(void) {
  mpc_parser_t *re0 = mpc_re_mode("(^[a-z]*$)*", MPC_RE_MULTILINE);
  PT_ASSERT(regex_test_pass(re0, "hello\nhello", "hello\nhello"));
  PT_ASSERT(regex_test_pass(re0, "hello\nhello\n", "hello\nhello\n"));
  PT_ASSERT(regex_test_pass(re0, "\nblah\n\nblah\n", "\nblah\n\nblah\n"));
  /* Digits are outside [a-z]: only the leading empty line of the last input is expected back. */
  PT_ASSERT(regex_test_fail(re0, "45234", "45234"));
  PT_ASSERT(regex_test_fail(re0, "\n45234", "\n45234"));
  PT_ASSERT(regex_test_pass(re0, "\n45234", "\n"));
  mpc_delete(re0);
 }
 /* Compares the same anchored dot-star pattern built with MPC_RE_DEFAULT */
 /* (re0) and with MPC_RE_DOTALL (re1) on inputs with and without newlines. */
 /* NOTE(review): regex_test_pass / regex_test_fail are defined outside this */
 /* hunk; presumably they parse the second argument and compare the result */
 /* with the third - confirm. */
 void test_regex_dotall(void) {
  mpc_parser_t *re0 = mpc_re_mode("^.*$", MPC_RE_DEFAULT);
  mpc_parser_t *re1 = mpc_re_mode("^.*$", MPC_RE_DOTALL);
  /* Default mode. */
  PT_ASSERT(regex_test_pass(re0, "hello", "hello"));
  PT_ASSERT(regex_test_fail(re0, "hello\n", "hello"));
  PT_ASSERT(regex_test_fail(re0, "he\nllo\n", "he"));
  PT_ASSERT(regex_test_pass(re0, "34njaksdklmasd", "34njaksdklmasd"));
  PT_ASSERT(regex_test_fail(re0, "34njaksd\nklmasd", "34njaksd"));
  /* DOTALL mode: every input is expected back in full, newlines included. */
  PT_ASSERT(regex_test_pass(re1, "hello", "hello"));
  PT_ASSERT(regex_test_pass(re1, "hello\n", "hello\n"));
  PT_ASSERT(regex_test_pass(re1, "he\nllo\n", "he\nllo\n"));
  PT_ASSERT(regex_test_pass(re1, "34njaksdklmasd", "34njaksdklmasd"));
  PT_ASSERT(regex_test_pass(re1, "34njaksd\nklmasd", "34njaksd\nklmasd"));
  mpc_delete(re0);
  mpc_delete(re1);
 }
 void suite_regex(void) {
  pt_add_test(test_regex_basic, "Test Regex Basic", "Suite Regex");
  pt_add_test(test_regex_range, "Test Regex Range", "Suite Regex");
@@ -139,4 +176,6 @@ void suite_regex(void) {
  pt_add_test(test_regex_lisp_comment, "Test Regex Lisp Comment", "Suite Regex");
  pt_add_test(test_regex_boundary, "Test Regex Boundary", "Suite Regex");
  pt_add_test(test_regex_newline, "Test Regex Newline", "Suite Regex");
  pt_add_test(test_regex_multiline, "Test Regex Multiline", "Suite Regex");
  pt_add_test(test_regex_dotall, "Test Regex Dotall", "Suite Regex");
 }