Added mode option to regex and also changed example from a line reader to a tokenizer.

2018-10-14 17:20:11 -04:00
parent 95439eb9c8
commit 4a992d91ab
7 changed files with 254 additions and 59 deletions
--- a/README.md
+++ b/README.md
@@ -560,6 +560,20 @@ This function makes a copy of a parser `a`. This can be useful when you want to
 use a parser as input for some other parsers multiple times without retaining 
 it. 

+* * *
+
+```c
+mpc_parser_t *mpc_re(const char *re);
+mpc_parser_t *mpc_re_mode(const char *re, int mode);
+```
+
+This function takes as input the regular expression `re` and builds a parser 
+for it. With the `mpc_re_mode` function, optional mode flags can also be given. 
+Available flags are `MPC_RE_MULTILINE` / `MPC_RE_M`, where the start of input 
+character `^` also matches the beginning of new lines and the end of input 
+character `$` also matches new lines, and `MPC_RE_DOTALL` / `MPC_RE_S`, where 
+the any character token `.` also matches newlines (by default it doesn't).
+

 Library Reference
 =================
@@ -573,6 +587,7 @@ Common Parsers
  <tr><td><code>mpc_soi</code></td><td>Matches only the start of input, returns <code>NULL</code></td></tr>
  <tr><td><code>mpc_eoi</code></td><td>Matches only the end of input, returns <code>NULL</code></td></tr>
  <tr><td><code>mpc_boundary</code></td><td>Matches only the boundary between words, returns <code>NULL</code></td></tr>
+  <tr><td><code>mpc_boundary_newline</code></td><td>Matches the start of a new line, returns <code>NULL</code></td></tr>
  <tr><td><code>mpc_whitespace</code></td><td>Matches any whitespace character <code>" \f\n\r\t\v"</code></td></tr>
  <tr><td><code>mpc_whitespaces</code></td><td>Matches zero or more whitespace characters</td></tr>
  <tr><td><code>mpc_blank</code></td><td>Matches whitespaces and frees the result, returns <code>NULL</code></td></tr>
@@ -807,65 +822,64 @@ mpc_err_t *mpca_lang_contents(int flags, const char *filename, ...);

 This opens and reads in the contents of the file given by `filename` and passes it to `mpca_lang`.

-Case Study - Line Reader
-========================
+Case Study - Tokenizer
+======================

-Another common task we might be interested in doing is parsing a file line by line and doing something on each line we encounter. For this we can setup something like the following:
+Another common task we might be interested in doing is tokenizing some block of 
+text (splitting the text into individual elements) and performing some function
+on each one of these elements as it is read. We can do this with `mpc` too.

-First, we can build a regular expression which parses a single line: `mpc_re("[^\\n]*(\\n|$)")`, next we can add a callback function using `mpc_apply` which gets called every time a line is parsed successfully `mpc_apply(mpc_re("[^\\n]*(\\n|$)"), read_line)`. Finally we can surround all of this in `mpc_many` to parse zero or more lines. The final thing might look something like this:
+First, we can build a regular expression which parses an individual token. For 
+example, if our tokens are identifiers, integers, commas, periods and colons, we 
+could build something like this: `mpc_re("\\s*([a-zA-Z_]+|[0-9]+|,|\\.|:)")`. 
+Next we can strip any whitespace, and add a callback function using `mpc_apply` 
+which gets called every time this regex is parsed successfully: 
+`mpc_apply(mpc_strip(mpc_re("\\s*([a-zA-Z_]+|[0-9]+|,|\\.|:)")), print_token)`. 
+Finally we can surround all of this in `mpc_many` to parse it zero or more 
+times. The final code might look something like this:

 ```c
-static void* read_line(void* line) {
-  printf("Reading Line: %s", (char*)line);
-  return line;
+static mpc_val_t *print_token(mpc_val_t *x) {
+  printf("Token: '%s'\n", (char*)x);
+  return x;
 }

 int main(int argc, char **argv) {

-  const char *input = 
-    "abcHVwufvyuevuy3y436782\n"
-    "\n"
-    "\n"
-    "rehre\n"
-    "rew\n"
-    "-ql.;qa\n"
-    "eg";
+  const char *input = "  hello 4352 ,  \n foo.bar   \n\n  test:ing   ";
  
-  mpc_parser_t* Line = mpc_many(
-    mpcf_strfold, 
-    mpc_apply(mpc_re("[^\\n]*(\\n|$)"), read_line));
+  mpc_parser_t* Tokens = mpc_many(
+    mpcf_all_free, 
+    mpc_apply(mpc_strip(mpc_re("\\s*([a-zA-Z_]+|[0-9]+|,|\\.|:)")), print_token));
  
  mpc_result_t r;
+  mpc_parse("input", input, Tokens, &r);
  
-  mpc_parse("input", input, Line, &r);
-  printf("\nParsed String: %s", (char*)r.output);
-  free(r.output);
-  
-  mpc_delete(Line);
+  mpc_delete(Tokens);
  
  return 0;
 }
 ```

-This program will produce an output something like this:
+Running this program will produce an output something like this:

 ```
-Reading Line: abcHVwufvyuevuy3y436782
-Reading Line:
-Reading Line:
-Reading Line: rehre
-Reading Line: rew
-Reading Line: -ql.;qa
-Reading Line: eg
-Parsed String: abcHVwufvyuevuy3y436782
-
-
-rehre
-rew
-ql.;qa
-eg
+Token: 'hello'
+Token: '4352'
+Token: ','
+Token: 'foo'
+Token: '.'
+Token: 'bar'
+Token: 'test'
+Token: ':'
+Token: 'ing'
 ```

+By extending the regex we can easily support many more types of tokens and 
+quickly build a tokenizer for whatever language we are
+interested in.
+
+
 Error Reporting
 ===============

--- a/mpc.c
+++ b/mpc.c
@@ -1979,7 +1979,13 @@ static int mpc_boundary_anchor(char prev, char next) {
  return 0;
 }

-mpc_parser_t *mpc_boundary(void) { return mpc_expect(mpc_anchor(mpc_boundary_anchor), "boundary"); }
+static int mpc_boundary_newline_anchor(char prev, char next) { /* Anchor predicate: returns non-zero when the preceding character is '\n'. */
+  (void)next; /* next is deliberately unused; only the previous character matters */
+  return prev == '\n'; 
+}
+
+mpc_parser_t *mpc_boundary(void) { return mpc_expect(mpc_anchor(mpc_boundary_anchor), "word boundary"); } /* Anchor parser using mpc_boundary_anchor, labelled "word boundary" via mpc_expect. */
+mpc_parser_t *mpc_boundary_newline(void) { return mpc_expect(mpc_anchor(mpc_boundary_newline_anchor), "start of newline"); } /* Anchor parser using mpc_boundary_newline_anchor (previous character is '\n'). */

 mpc_parser_t *mpc_whitespace(void) { return mpc_expect(mpc_oneof(" \f\n\r\t\v"), "whitespace"); }
 mpc_parser_t *mpc_whitespaces(void) { return mpc_expect(mpc_many(mpcf_strfold, mpc_whitespace()), "spaces"); }
@@ -2192,19 +2198,44 @@ static mpc_parser_t *mpc_re_escape_char(char c) {
  }
 }

-static mpc_val_t *mpcf_re_escape(mpc_val_t *x) {
+static mpc_val_t *mpcf_re_escape(mpc_val_t *x, void* data) {
  
+  int mode = *((int*)data);
  char *s = x;
  mpc_parser_t *p;
  
-  /* Regex Special Characters */
-  if (s[0] == '.') { free(s); return mpc_any(); }
-  if (s[0] == '^') { free(s); return mpc_and(2, mpcf_snd, mpc_soi(), mpc_lift(mpcf_ctor_str), free); }
+  /* Any Character */
+  if (s[0] == '.') {
+    free(s);
+    if (mode & MPC_RE_DOTALL) {
+      return mpc_any();      
+    } else {
+      return mpc_expect(mpc_noneof("\n"), "any character except a newline");
+    }
+  }
+  
+  /* Start of Input */
+  if (s[0] == '^') {
+    free(s);
+    if (mode & MPC_RE_MULTILINE) {
+      return mpc_and(2, mpcf_snd, mpc_or(2, mpc_soi(), mpc_boundary_newline()), mpc_lift(mpcf_ctor_str), free);
+    } else {
+      return mpc_and(2, mpcf_snd, mpc_soi(), mpc_lift(mpcf_ctor_str), free);
+    }
+  }
+  
+  /* End of Input */
  if (s[0] == '$') {
    free(s); 
-    return mpc_or(2, 
-      mpc_and(2, mpcf_fst, mpc_newline(), mpc_eoi(), free), 
-      mpc_and(2, mpcf_snd, mpc_eoi(), mpc_lift(mpcf_ctor_str), free));
+    if (mode & MPC_RE_MULTILINE) {
+      return mpc_or(2, 
+        mpc_newline(), 
+        mpc_and(2, mpcf_snd, mpc_eoi(), mpc_lift(mpcf_ctor_str), free)); 
+    } else {
+      return mpc_or(2, 
+        mpc_and(2, mpcf_fst, mpc_newline(), mpc_eoi(), free), 
+        mpc_and(2, mpcf_snd, mpc_eoi(), mpc_lift(mpcf_ctor_str), free));      
+    }
  }
  
  /* Regex Escape */
@@ -2302,6 +2333,10 @@ static mpc_val_t *mpcf_re_range(mpc_val_t *x) {
 }

 mpc_parser_t *mpc_re(const char *re) {
+  return mpc_re_mode(re, MPC_RE_DEFAULT); /* Thin wrapper: same as mpc_re_mode with no mode flags set. */
+}
+
+mpc_parser_t *mpc_re_mode(const char *re, int mode) {
  
  char *err_msg;
  mpc_parser_t *err_out;
@@ -2334,8 +2369,8 @@ mpc_parser_t *mpc_re(const char *re) {
  mpc_define(Base, mpc_or(4,
    mpc_parens(Regex, (mpc_dtor_t)mpc_delete),
    mpc_squares(Range, (mpc_dtor_t)mpc_delete),
-    mpc_apply(mpc_escape(), mpcf_re_escape),
-    mpc_apply(mpc_noneof(")|"), mpcf_re_escape)
+    mpc_apply_to(mpc_escape(), mpcf_re_escape, &mode),
+    mpc_apply_to(mpc_noneof(")|"), mpcf_re_escape, &mode)
  ));
  
  mpc_define(Range, mpc_apply(
@@ -3320,7 +3355,7 @@ mpc_parser_t *mpca_total(mpc_parser_t *a) { return mpc_total(a, (mpc_dtor_t)mpc_
 **      <base> : "<" (<digits> | <ident>) ">"
 **             | <string_lit>
 **             | <char_lit>
-**             | <regex_lit>
+**             | <regex_lit> <regex_mode>
 **             | "(" <grammar> ")"
 */

@@ -3379,11 +3414,21 @@ static mpc_val_t *mpcaf_grammar_char(mpc_val_t *x, void *s) {
  return mpca_state(mpca_tag(mpc_apply(p, mpcf_str_ast), "char"));
 }

-static mpc_val_t *mpcaf_grammar_regex(mpc_val_t *x, void *s) {
-  mpca_grammar_st_t *st = s;
-  char *y = mpcf_unescape_regex(x);
-  mpc_parser_t *p = (st->flags & MPCA_LANG_WHITESPACE_SENSITIVE) ? mpc_re(y) : mpc_tok(mpc_re(y));
+static mpc_val_t *mpcaf_fold_regex(int n, mpc_val_t **xs) { /* Fold for "<regex_lit> <regex_mode>": builds a regex parser from xs[0] using the mode letters in xs[1]. */
+  char *y = xs[0]; /* regex literal text */
+  char *m = xs[1]; /* mode letters that followed the literal; scanned for 'm' and 's' below */
+  mpca_grammar_st_t *st = xs[2]; /* grammar state; only its flags are read here and it is not freed */
+  mpc_parser_t *p;
+  int mode = MPC_RE_DEFAULT;
+  
+  (void)n; /* element count is unused; the fold always reads xs[0..2] */
+  if (strchr(m, 'm')) { mode |= MPC_RE_MULTILINE; }
+  if (strchr(m, 's')) { mode |= MPC_RE_DOTALL; }
+  y = mpcf_unescape_regex(y); /* y now refers to the unescaped pattern, which is freed below */
+  p = (st->flags & MPCA_LANG_WHITESPACE_SENSITIVE) ? mpc_re_mode(y, mode) : mpc_tok(mpc_re_mode(y, mode)); /* wrap in mpc_tok unless the grammar is whitespace sensitive */
  free(y);
+  free(m);
+  
  return mpca_state(mpca_tag(mpc_apply(p, mpcf_str_ast), "regex"));
 }

@@ -3496,7 +3541,7 @@ mpc_parser_t *mpca_grammar_st(const char *grammar, mpca_grammar_st_t *st) {
  mpc_define(Base, mpc_or(5,
    mpc_apply_to(mpc_tok(mpc_string_lit()), mpcaf_grammar_string, st),
    mpc_apply_to(mpc_tok(mpc_char_lit()),   mpcaf_grammar_char, st),
-    mpc_apply_to(mpc_tok(mpc_regex_lit()),  mpcaf_grammar_regex, st),
+    mpc_tok(mpc_and(3, mpcaf_fold_regex, mpc_regex_lit(), mpc_many(mpcf_strfold, mpc_oneof("ms")), mpc_lift_val(st), free, free)),
    mpc_apply_to(mpc_tok_braces(mpc_or(2, mpc_digits(), mpc_ident()), free), mpcaf_grammar_id, st),
    mpc_tok_parens(Grammar, mpc_soft_delete)
  ));
@@ -3658,7 +3703,7 @@ static mpc_err_t *mpca_lang_st(mpc_input_t *i, mpca_grammar_st_t *st) {
  mpc_define(Base, mpc_or(5,
    mpc_apply_to(mpc_tok(mpc_string_lit()), mpcaf_grammar_string, st),
    mpc_apply_to(mpc_tok(mpc_char_lit()),   mpcaf_grammar_char, st),
-    mpc_apply_to(mpc_tok(mpc_regex_lit()),  mpcaf_grammar_regex, st),
+    mpc_tok(mpc_and(3, mpcaf_fold_regex, mpc_regex_lit(), mpc_many(mpcf_strfold, mpc_oneof("ms")), mpc_lift_val(st), free, free)),
    mpc_apply_to(mpc_tok_braces(mpc_or(2, mpc_digits(), mpc_ident()), free), mpcaf_grammar_id, st),
    mpc_tok_parens(Grammar, mpc_soft_delete)
  ));
--- a/mpc.h
+++ b/mpc.h
@@ -156,6 +156,7 @@ mpc_parser_t *mpc_eoi(void);
 mpc_parser_t *mpc_soi(void);

 mpc_parser_t *mpc_boundary(void);
+mpc_parser_t *mpc_boundary_newline(void);

 mpc_parser_t *mpc_whitespace(void);
 mpc_parser_t *mpc_whitespaces(void);
@@ -264,7 +265,16 @@ mpc_val_t *mpcf_maths(int n, mpc_val_t** xs);
 ** Regular Expression Parsers
 */

+enum { /* Mode flags accepted by mpc_re_mode; values are bit flags that can be OR-ed together. */
+  MPC_RE_DEFAULT   = 0, /* no flags set */
+  MPC_RE_M         = 1, /* short alias, same value as MPC_RE_MULTILINE */
+  MPC_RE_S         = 2, /* short alias, same value as MPC_RE_DOTALL */
+  MPC_RE_MULTILINE = 1, /* '^' and '$' also match at line boundaries */
+  MPC_RE_DOTALL    = 2 /* '.' also matches newline characters */
+};
+
 mpc_parser_t *mpc_re(const char *re);
+mpc_parser_t *mpc_re_mode(const char *re, int mode);
  
 /*
 ** AST
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
  "name": "mpc",
-  "version": "0.8.8",
+  "version": "0.9.8",
  "repo": "orangeduck/mpc",
  "description": "A Parser Combinator library for C",
  "keywords": ["parser", "combinator", "library", "c", "mpc"],
--- a/tests/core.c
+++ b/tests/core.c
@@ -154,7 +154,7 @@ void test_copy(void) {

 static int line_count = 0;

-static void* read_line(void* line) {
+static mpc_val_t* read_line(mpc_val_t* line) {
  line_count++;
  return line;
 }
@@ -185,6 +185,32 @@ void test_reader(void) {

 }

+static int token_count = 0;
+
+static mpc_val_t *print_token(mpc_val_t *x) { /* Apply callback: prints the token string, counts it, and returns x unchanged. */
+  printf("Token: '%s'\n", (char*)x);
+  token_count++; /* tally checked by test_tokens */
+  return x;
+}
+
+void test_tokens(void) { /* Tokenizer case study: checks both the folded output string and the number of callback invocations. */
+  
+  mpc_parser_t* Tokens = mpc_many(
+    mpcf_strfold, 
+    mpc_apply(mpc_strip(mpc_re("\\s*([a-zA-Z_]+|[0-9]+|,|\\.|:)")), print_token)); /* one token per match: identifier, integer, ',', '.' or ':' */
+  
+  token_count = 0; /* reset the shared counter so the assertion below only sees this run */
+
+  PT_ASSERT(mpc_test_pass(Tokens, 
+    "  hello 4352 ,  \n foo.bar   \n\n  test:ing   ", 
+    "hello4352,foo.bartest:ing", streq, free, strprint)); /* expected value is the tokens joined with no whitespace */
+  
+  PT_ASSERT(token_count == 9); /* hello 4352 , foo . bar test : ing */
+
+  mpc_delete(Tokens);
+  
+}
+
 void test_eoi(void) {
  
  mpc_parser_t* Line = mpc_re("[^\\n]*$");
@@ -203,5 +229,6 @@ void suite_core(void) {
  pt_add_test(test_repeat, "Test Repeat", "Suite Core");
  pt_add_test(test_copy,   "Test Copy",   "Suite Core");
  pt_add_test(test_reader, "Test Reader", "Suite Core");
+  pt_add_test(test_tokens, "Test Tokens", "Suite Core");
  pt_add_test(test_eoi,    "Test EOI",    "Suite Core");
 }
--- a/tests/grammar.c
+++ b/tests/grammar.c
@@ -163,6 +163,8 @@ void test_partial(void) {
    (mpc_dtor_t)mpc_ast_delete, 
    (void(*)(const void*))mpc_ast_print));
    
+  mpc_ast_delete(t0);
+
  mpc_cleanup(5, Line, Number, QuotedString, LinePragma, Parser);

 }
@@ -248,6 +250,8 @@ void test_qscript(void) {
    (mpc_dtor_t)mpc_ast_delete,
    (void(*)(const void*))mpc_ast_print));
  
+  mpc_ast_delete(t0);
+
  mpc_cleanup(18, Qscript, Comment, Resource, Rtype, Rname, InnerBlock,
  Statement, Function, Parameter, Literal, Block, Seperator, Qstring,
  SimpleStr, ComplexStr, Number, Float, Int);
@@ -278,6 +282,61 @@ void test_missingrule(void) {

 }

+void test_regex_mode(void) { /* Exercises the 's' and 'm' regex mode suffixes in the grammar language. */
+  
+  mpc_parser_t *Line0, *Line1, *Line2, *Line3;
+  mpc_ast_t *t0, *t1, *t2, *t3, *t4;
+  
+  Line0 = mpc_new("line0");
+  Line1 = mpc_new("line1");
+  Line2 = mpc_new("line2");
+  Line3 = mpc_new("line3");
+  
+  mpca_lang(MPCA_LANG_DEFAULT, " line0 : /.*/; ", Line0); /* no suffix: default mode */
+  mpca_lang(MPCA_LANG_DEFAULT, " line1 : /.*/s; ", Line1); /* 's' suffix: dotall */
+  mpca_lang(MPCA_LANG_DEFAULT, " line2 : /(^[a-z]*$)*/; ", Line2); /* no suffix: default mode */
+  mpca_lang(MPCA_LANG_DEFAULT, " line3 : /(^[a-z]*$)*/m; ", Line3); /* 'm' suffix: multiline */
+  
+  t0 = mpc_ast_new("regex", "blah"); /* expected for Line0: match stops at the newline */
+  t1 = mpc_ast_new("regex", "blah\nblah"); /* expected for Line1: '.' consumes the newline too */
+  t2 = mpc_ast_new("regex", ""); /* expected for Line2 on two-line input: empty match */
+  t3 = mpc_ast_new("regex", "blah"); /* expected for Line2 on single-line input */
+  t4 = mpc_ast_new("regex", "blah\nblah"); /* expected for Line3: both lines match */
+  
+  PT_ASSERT(mpc_test_pass(Line0, "blah\nblah", t0,
+    (int(*)(const void*,const void*))mpc_ast_eq,
+    (mpc_dtor_t)mpc_ast_delete,
+    (void(*)(const void*))mpc_ast_print));
+  
+  PT_ASSERT(mpc_test_pass(Line1, "blah\nblah", t1,
+    (int(*)(const void*,const void*))mpc_ast_eq,
+    (mpc_dtor_t)mpc_ast_delete,
+    (void(*)(const void*))mpc_ast_print));
+  
+  PT_ASSERT(mpc_test_pass(Line2, "blah\nblah", t2,
+    (int(*)(const void*,const void*))mpc_ast_eq,
+    (mpc_dtor_t)mpc_ast_delete,
+    (void(*)(const void*))mpc_ast_print));
+
+  PT_ASSERT(mpc_test_pass(Line2, "blah", t3,
+    (int(*)(const void*,const void*))mpc_ast_eq,
+    (mpc_dtor_t)mpc_ast_delete,
+    (void(*)(const void*))mpc_ast_print));
+    
+  PT_ASSERT(mpc_test_pass(Line3, "blah\nblah", t4,
+    (int(*)(const void*,const void*))mpc_ast_eq,
+    (mpc_dtor_t)mpc_ast_delete,
+    (void(*)(const void*))mpc_ast_print));
+  
+  mpc_ast_delete(t0); /* the expected trees are released here by the test itself */
+  mpc_ast_delete(t1);
+  mpc_ast_delete(t2);
+  mpc_ast_delete(t3);
+  mpc_ast_delete(t4);
+  
+  mpc_cleanup(4, Line0, Line1, Line2, Line3);
+}
+
 void suite_grammar(void) {
  pt_add_test(test_grammar, "Test Grammar", "Suite Grammar");
  pt_add_test(test_language, "Test Language", "Suite Grammar");
@@ -286,4 +345,5 @@ void suite_grammar(void) {
  pt_add_test(test_partial, "Test Partial", "Suite Grammar");
  pt_add_test(test_qscript, "Test QScript", "Suite Grammar");
  pt_add_test(test_missingrule, "Test Missing Rule", "Suite Grammar");
+  pt_add_test(test_regex_mode, "Test Regex Mode", "Suite Grammar");
 }
--- a/tests/regex.c
+++ b/tests/regex.c
@@ -132,6 +132,43 @@ void test_regex_newline(void) {
  
 }

+void test_regex_multiline(void) { /* Checks '^' and '$' at line boundaries under MPC_RE_MULTILINE. */
+
+  mpc_parser_t *re0 = mpc_re_mode("(^[a-z]*$)*", MPC_RE_MULTILINE); /* zero or more whole lines of lowercase letters */
+
+  PT_ASSERT(regex_test_pass(re0, "hello\nhello", "hello\nhello")); /* NOTE(review): regex_test_pass/regex_test_fail are defined outside this hunk; presumably (parser, input, expected output) */
+  PT_ASSERT(regex_test_pass(re0, "hello\nhello\n", "hello\nhello\n"));
+  PT_ASSERT(regex_test_pass(re0, "\nblah\n\nblah\n", "\nblah\n\nblah\n")); /* empty lines are accepted as well */
+  PT_ASSERT(regex_test_fail(re0, "45234", "45234")); /* digits are outside [a-z] */
+  PT_ASSERT(regex_test_fail(re0, "\n45234", "\n45234"));
+  PT_ASSERT(regex_test_pass(re0, "\n45234", "\n")); /* only the leading empty line is expected to match */
+  
+  mpc_delete(re0);
+  
+}
+
+void test_regex_dotall(void) { /* Compares "^.*$" in default mode against MPC_RE_DOTALL. */
+  
+  mpc_parser_t *re0 = mpc_re_mode("^.*$", MPC_RE_DEFAULT); /* '.' does not match '\n' */
+  mpc_parser_t *re1 = mpc_re_mode("^.*$", MPC_RE_DOTALL); /* '.' matches '\n' as well */
+
+  PT_ASSERT(regex_test_pass(re0, "hello", "hello"));
+  PT_ASSERT(regex_test_fail(re0, "hello\n", "hello")); /* default mode: inputs containing '\n' are asserted with regex_test_fail */
+  PT_ASSERT(regex_test_fail(re0, "he\nllo\n", "he"));
+  PT_ASSERT(regex_test_pass(re0, "34njaksdklmasd", "34njaksdklmasd"));
+  PT_ASSERT(regex_test_fail(re0, "34njaksd\nklmasd", "34njaksd"));
+  
+  PT_ASSERT(regex_test_pass(re1, "hello", "hello"));
+  PT_ASSERT(regex_test_pass(re1, "hello\n", "hello\n")); /* dotall: the same inputs are expected to match in full */
+  PT_ASSERT(regex_test_pass(re1, "he\nllo\n", "he\nllo\n"));
+  PT_ASSERT(regex_test_pass(re1, "34njaksdklmasd", "34njaksdklmasd"));
+  PT_ASSERT(regex_test_pass(re1, "34njaksd\nklmasd", "34njaksd\nklmasd"));
+  
+  mpc_delete(re0);
+  mpc_delete(re1);
+  
+}
+
 void suite_regex(void) {
  pt_add_test(test_regex_basic, "Test Regex Basic", "Suite Regex");
  pt_add_test(test_regex_range, "Test Regex Range", "Suite Regex");
@@ -139,4 +176,6 @@ void suite_regex(void) {
  pt_add_test(test_regex_lisp_comment, "Test Regex Lisp Comment", "Suite Regex");
  pt_add_test(test_regex_boundary, "Test Regex Boundary", "Suite Regex");
  pt_add_test(test_regex_newline, "Test Regex Newline", "Suite Regex");
+  pt_add_test(test_regex_multiline, "Test Regex Multiline", "Suite Regex");
+  pt_add_test(test_regex_dotall, "Test Regex Dotall", "Suite Regex");
 }