commit c736ef5b1b30924643bb57d812979b0d91e7586e
Author: ranchuan
Date:   Fri Nov 29 19:11:43 2024 +0800

    Implement a Python version of a C lexer

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ba0430d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+__pycache__/
\ No newline at end of file
diff --git a/lex_c.py b/lex_c.py
new file mode 100644
index 0000000..819450a
--- /dev/null
+++ b/lex_c.py
@@ -0,0 +1,244 @@
+import dataclasses
+
+
+TOKEN_IF = 256
+TOKEN_BREAK = 257
+TOKEN_WHILE = 258
+TOKEN_SWITCH = 259
+TOKEN_CASE = 260
+TOKEN_DO = 261
+TOKEN_CHAR = 262
+TOKEN_INT = 263
+TOKEN_VOID = 264
+TOKEN_SYMBOL = 265
+TOKEN_NUM = 266       # number literal
+TOKEN_INC = 267       # increment
+TOKEN_DEC = 268       # decrement
+TOKEN_EQ = 269        # equal
+TOKEN_NEQ = 270       # not equal
+TOKEN_LSH = 271       # left shift
+TOKEN_RSH = 272       # right shift
+TOKEN_LEQ = 273       # less than or equal
+TOKEN_GEQ = 274       # greater than or equal
+TOKEN_ELSE = 275
+TOKEN_CONTINUE = 276
+TOKEN_CONST = 277
+TOKEN_STATIC = 278
+TOKEN_UNSIGNED = 279
+TOKEN_TYPEDEF = 280
+TOKEN_STRUCT = 281
+TOKEN_ENUM = 282
+TOKEN_UNION = 283
+TOKEN_STRING = 284
+TOKEN_DEFAULT = 285
+TOKEN_RETURN = 286
+
+# single-character tokens use their own byte value as the token code
+def TOKEN(t:str):
+    return t.encode("utf-8")[0]
+
+_KeyWordTable={
+    "if":TOKEN_IF,
+    "else":TOKEN_ELSE,
+    "break":TOKEN_BREAK,
+    "while":TOKEN_WHILE,
+    "switch":TOKEN_SWITCH,
+    "case":TOKEN_CASE,
+    "do":TOKEN_DO,
+    "char":TOKEN_CHAR,
+    "int":TOKEN_INT,
+    "void":TOKEN_VOID,
+    "continue":TOKEN_CONTINUE,
+    "const":TOKEN_CONST,
+    "static":TOKEN_STATIC,
+    "unsigned":TOKEN_UNSIGNED,
+    "typedef":TOKEN_TYPEDEF,
+    "struct":TOKEN_STRUCT,
+    "enum":TOKEN_ENUM,
+    "union":TOKEN_UNION,
+    "default":TOKEN_DEFAULT,
+    "return":TOKEN_RETURN,
+}
+
+_MarkTable={
+    "<<":TOKEN_LSH,
+    ">>":TOKEN_RSH,
+    "<=":TOKEN_LEQ,
+    ">=":TOKEN_GEQ,
+    "!=":TOKEN_NEQ,
+    "==":TOKEN_EQ,
+    "++":TOKEN_INC,
+    "--":TOKEN_DEC,
+    "=":TOKEN("="),
+    "!":TOKEN("!"),
+    "<":TOKEN("<"),
+    ">":TOKEN(">"),
+    "+":TOKEN("+"),
+    "-":TOKEN("-"),
+}
+
+
+# whether the byte is a letter or digit
+def isalnum(num:int):
+    return bytes([num]).isalnum()
+
+# whether the byte is a letter, digit, or underscore
+def isalnum_(num:int):
+    return bytes([num]).isalnum() or num==TOKEN("_")
+
+# whether the byte is a letter
+def isalpha(num:int):
+    return bytes([num]).isalpha()
+
+# whether the byte is a letter or underscore
+def isalpha_(num:int):
+    return bytes([num]).isalpha() or num==TOKEN("_")
+
+# whether the byte is a digit
+def isdigit(num:int):
+    return bytes([num]).isdigit()
+
+# whether the byte is a digit or a decimal point
+def isdigitdot(num:int):
+    return bytes([num]).isdigit() or num==TOKEN(".")
+
+# whether the byte is whitespace, including newlines
+def isspace(num:int):
+    return bytes([num]).isspace()
+
+# whether the byte is one of the characters in the given string
+def isinstr(num:int,t:str):
+    c=bytes([num])
+    return c in t.encode("utf-8")
+
+# whether the byte is an operator character
+def isoperator(num:int):
+    return isinstr(num,"<>!+-=")
+
+@dataclasses.dataclass
+class lex_token:
+    name:str
+    buff:bytearray
+    token:int
+    line:int
+    pos:int
+
+
+class lex_class(object):
+    def __init__(self,text:bytes) -> None:
+        self.text=text
+        self.index=-1
+        self.line=1
+        self.pos=-1
+        self.token_list:list[lex_token]=[]
+        self.token_buff=bytearray()
+    def save_char(self,c:int):
+        self.token_buff.append(c&0xff)
+    def save_token(self,token:lex_token):
+        self.token_list.append(token)
+        self.token_buff=bytearray()
+    def _get_char(self):
+        if(self.index<len(self.text)):
+            return self.text[self.index]
+        return 0  # end-of-input sentinel; matches no character class
+    def get_next_char(self):
+        self.index+=1
+        self.pos+=1
+        c=self._get_char()
+        if(c==TOKEN("\n")):
+            self.line+=1
+            self.pos=0
+        return c
+    def is_end(self):
+        return self.index>=len(self.text)
+    def save_one_char_token(self,c:int):
+        token=lex_token(bytes([c]).decode("utf-8"),bytes([c]),c,self.line,self.pos)
+        self.save_token(token)
+    def read_name_and_save(self,c:int):
+        token=lex_token("symbol",bytearray(),TOKEN_SYMBOL,self.line,self.pos)
+        self.save_char(c)
+        while True:
+            c=self.get_next_char()
+            if(isalnum_(c)):
+                self.save_char(c)
+            else:
+                break
+        name=self.token_buff.decode("utf-8")
+        if(name in _KeyWordTable):
+            token.token=_KeyWordTable[name]
+        token.name=name
+        token.buff=self.token_buff
+        self.save_token(token)
+        return c
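+    # Operator scanning below is greedy (maximal munch): it consumes every
+    # consecutive operator character and looks the whole run up in
+    # _MarkTable, so "<<" lexes as TOKEN_LSH rather than two "<" tokens.
+    # A run not in the table (e.g. "=-" in "a=-b" written without spaces)
+    # raises rather than being split into "=" and "-".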
+    def read_operator_and_save(self,c:int):
+        token=lex_token("operator",bytearray(),TOKEN_SYMBOL,self.line,self.pos)
+        self.save_char(c)
+        while True:
+            c=self.get_next_char()
+            if(isoperator(c)):
+                self.save_char(c)
+            else:
+                break
+        name=self.token_buff.decode("utf-8")
+        if(name in _MarkTable):
+            token.token=_MarkTable[name]
+            token.name=name
+        else:
+            raise Exception(f"unknown operator {name}")
+        token.buff=self.token_buff
+        self.save_token(token)
+        return c
+    def read_num_and_save(self,c:int):
+        token=lex_token("number",bytearray(),TOKEN_NUM,self.line,self.pos)
+        self.save_char(c)
+        while True:
+            c=self.get_next_char()
+            if(isdigitdot(c)):
+                self.save_char(c)
+            else:
+                break
+        if(self.token_buff.count(b'.')>1):
+            raise Exception("a number cannot contain more than one decimal point")
+        token.buff=self.token_buff
+        self.save_token(token)
+        return c
+    def read_str_and_save(self,c:int):
+        c=self.get_next_char()
+        while c!=b'\"'[0]:
+            if(self.is_end()):
+                raise Exception(f"unterminated string literal at line:{self.line}")
+            self.save_char(c)
+            c=self.get_next_char()
+        self.save_token(lex_token("string",self.token_buff,TOKEN_STRING,self.line,self.pos))
+        return self.get_next_char()
+
+def lex(text:bytes):
+    lex_obj = lex_class(text)
+    c=lex_obj.get_next_char()
+    while not lex_obj.is_end():
+        if isalpha_(c):
+            c=lex_obj.read_name_and_save(c)
+        elif isinstr(c,"{}[]()~,;:*"):
+            lex_obj.save_one_char_token(c)
+            c=lex_obj.get_next_char()
+        elif isdigit(c):
+            c=lex_obj.read_num_and_save(c)
+        elif isspace(c):
+            c=lex_obj.get_next_char()
+        elif isoperator(c):
+            c=lex_obj.read_operator_and_save(c)
+        elif isinstr(c,"\""):
+            c=lex_obj.read_str_and_save(c)
+        else:
+            raise Exception(f"unexpected char {bytes([c])} at line:{lex_obj.line} pos:{lex_obj.pos}")
+    # for item in lex_obj.token_list:
+    #     print(f"{item}")
+    return lex_obj.token_list
+
+if __name__ == "__main__":
+    with open("main.c",mode='rb') as f:
+        lex(f.read())
diff --git a/main.c b/main.c
new file mode 100644
index 0000000..998cf87
--- /dev/null
+++ b/main.c
@@ -0,0 +1,25 @@
+
+
+
+const char* get_type(int s) {
+    const char* ret;
+    switch (s)
+    {
+    case 1:
+    case 2:
+    case 3:
+    case 4:
+    case 5:
+    case 6:
+    case 7:
+        ret = "yes";
+        break;
+
+    default:
+        ret = "no";
+        break;
+    }
+    return ret;
+}
+
diff --git a/parser_c.py b/parser_c.py
new file mode 100644
index 0000000..11b75ac
--- /dev/null
+++ b/parser_c.py
@@ -0,0 +1,30 @@
+import dataclasses
+from lex_c import lex_token
+from lex_c import lex
+
+
+
+@dataclasses.dataclass
+class node:
+    name:str
+    next:None
+    child:None
+    token_list:list[lex_token]
+
+# variable declaration node
+@dataclasses.dataclass
+class node_vdecl(node):
+    vvalue:None
+    vtype:str
+    vattr:list[str]
+
+# function definition node
+@dataclasses.dataclass
+class node_fdef(node):
+    rettype:str
+
+if __name__ == "__main__":
+    with open("main.c",mode='rb') as f:
+        token_list=lex(f.read())
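
A minimal usage sketch (an illustration, not part of the commit): driving lex()
from another script and printing each token's fields. Only lex() and the
lex_token dataclass fields (name, buff, token, line, pos) come from lex_c.py;
the output formatting is an assumption.

    from lex_c import lex

    with open("main.c", mode="rb") as f:
        for tok in lex(f.read()):
            # token codes >= 256 are keywords, multi-char operators, or
            # literals; codes < 256 are the byte value of a one-char token
            print(f"line {tok.line:3} pos {tok.pos:3} code {tok.token:3} "
                  f"{tok.name!r} buff={bytes(tok.buff)!r}")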