From e67ded6cc9a413cd1cc5e580495d3a4ea6f939ef Mon Sep 17 00:00:00 2001 From: ranchuan Date: Wed, 15 Jan 2025 18:32:00 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E4=B8=80=E4=BA=9B=E7=AC=A6?= =?UTF-8?q?=E5=8F=B7=E4=B8=8D=E8=AF=86=E5=88=AB=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lex_c.py | 287 ----------------- node_declear.py | 128 -------- parser_c.py | 811 ++++++++++++++++++++++++++++++++++++++++-------- 3 files changed, 684 insertions(+), 542 deletions(-) delete mode 100644 lex_c.py delete mode 100644 node_declear.py diff --git a/lex_c.py b/lex_c.py deleted file mode 100644 index b2e2192..0000000 --- a/lex_c.py +++ /dev/null @@ -1,287 +0,0 @@ -import os -import sys -import dataclasses - - -TOKEN_IF = 256, -TOKEN_BREAK = 257, -TOKEN_WHILE=258, -TOKEN_SWITCH=259, -TOKEN_CASE=260, -TOKEN_DO=261, -TOKEN_CHAR=262, -TOKEN_INT=263, -TOKEN_VOID=264, -TOKEN_SYMBOL = 265 , -TOKEN_NUM = 266 ,# 数字 -TOKEN_INC = 267,# 自增 -TOKEN_DEC = 268,# 自减 -TOKEN_EQ = 269,# 相等 -TOKEN_NEQ = 270,# 不相等 -TOKEN_LSH = 271,# 左移 -TOKEN_RSH = 272,# 右移 -TOKEN_LEQ = 273,# 小于等于 -TOKEN_GEQ = 274,# 大于等于 -TOKEN_ELSE = 275, -TOKEN_CONTINUE = 276 , -TOKEN_CONST = 277 , -TOKEN_STATIC = 278 , -TOKEN_UNSIGNED = 279 , -TOKEN_TYPEDEF = 280 , -TOKEN_STRUCT = 281 , -TOKEN_ENUM = 282 , -TOKEN_UNION = 283, -TOKEN_STRING = 284, -TOKEN_DEFAULT = 285, -TOKEN_RETURN = 286, -TOKEN_ASSIG_ADD = 287 -TOKEN_ASSIG_SUB = 288 -TOKEN_ASSIG_MUL = 289 -TOKEN_ASSIG_DIV = 290 -TOKEN_ASSIG_LSH = 291 -TOKEN_ASSIG_RSH = 292 -TOKEN_EXTERN = 293 -TOKEN_FLOAT = 294 -TOKEN_DOUBLE = 295 -TOKEN_SHORT = 296 -TOKEN_LONG = 297 - - -def TOKEN(t:str): - return t.encode("utf-8")[0] - -_KeyWordTable={ - "if":TOKEN_IF, - "else":TOKEN_ELSE, - "break":TOKEN_BREAK, - "while":TOKEN_WHILE, - "switch":TOKEN_SWITCH, - "case":TOKEN_CASE, - "do":TOKEN_DO, - "char":TOKEN_CHAR, - "int":TOKEN_INT, - "void":TOKEN_VOID, - "continue":TOKEN_CONTINUE, - "const":TOKEN_CONST, - "static":TOKEN_STATIC, - "unisgned":TOKEN_UNSIGNED, - "typedef":TOKEN_TYPEDEF, - "struct":TOKEN_STRUCT, - "enum":TOKEN_ENUM, - "union":TOKEN_UNION, - "default":TOKEN_DEFAULT, - "return":TOKEN_RETURN, - "extern":TOKEN_EXTERN, - "float":TOKEN_FLOAT, - "double":TOKEN_DOUBLE, - "short":TOKEN_SHORT, - "long":TOKEN_LONG, -} - -_MarkTable={ - "<<":TOKEN_LSH, - ">>":TOKEN_RSH, - "<=":TOKEN_LEQ, - ">=":TOKEN_GEQ, - "!=":TOKEN_NEQ, - "==":TOKEN_EQ, - "++":TOKEN_INC, - "--":TOKEN_DEC, - "+=":TOKEN_ASSIG_ADD, - "-=":TOKEN_ASSIG_SUB, - "*=":TOKEN_ASSIG_MUL, - "<<=":TOKEN_ASSIG_LSH, - ">>=":TOKEN_ASSIG_RSH, - "=":TOKEN("="), - "!":TOKEN("!"), - "<":TOKEN("<"), - ">":TOKEN(">"), - "+":TOKEN("+"), - "-":TOKEN("-"), - -} - - - -# 是否是数字加字母 -def isalnum(num:int): - return bytes([num]).isalnum() - -# 是否是数字加字母或下划线 -def isalnum_(num:int): - return bytes([num]).isalnum() or num==TOKEN("_") - -# 是否是字母 -def isalpha(num:int): - return bytes([num]).isalpha() - -# 是否是字母或下划线 -def isalpha_(num:int): - return bytes([num]).isalpha() or num==TOKEN("_") - -# 是否是数字 -def isdigit(num:int): - return bytes([num]).isdigit() - -# 是否是数字或小数点 -def isdigitdot(num:int): - return bytes([num]).isdigit() or num==TOKEN(".") - -# 是否是空白字符 包括换行符 -def isspace(num:int): - return bytes([num]).isspace() - -# 是否是给定字符串之一 -def isinstr(num:int,t:str): - c=bytes([num]) - return c in t.encode("utf-8") - -# 是否是操作符 -def isoperator(num:int): - return isinstr(num,"<>!+-=") - -@dataclasses.dataclass -class lex_token: - name:str - buff:bytearray - token:int - line:int - pos:int - - -class 
lex_class(object): - def __init__(self,text:bytes) -> None: - self.text=text - self.index=-1 - self.line=1 - self.pos=-1 - self.token_list:list[lex_token]=[] - self.token_buff=bytearray() - def save_char(self,c:int): - self.token_buff.append(c&0xff) - def save_token(self,token:lex_token): - self.token_list.append(token) - self.token_buff=bytearray() - def _get_char(self): - if(self.index=len(self.text) - def save_one_char_token(self,c:int): - token=lex_token(bytes([c]).decode("utf-8"),bytes([c]),c,self.line,self.pos) - self.save_token(token) - def read_name_and_save(self,c:int): - token=lex_token("symbol",bytearray(),TOKEN_SYMBOL,self.line,self.pos) - self.save_char(c) - while True: - c=self.get_next_char() - if(isalnum_(c)): - self.save_char(c) - else: - break - name=self.token_buff.decode("utf-8") - if(name in _KeyWordTable): - token.token=_KeyWordTable[name] - token.name=name - token.buff=self.token_buff - self.save_token(token) - return c - def read_operator_and_save(self,c:int): - token=lex_token("operator",bytearray(),TOKEN_SYMBOL,self.line,self.pos) - self.save_char(c) - while True: - c=self.get_next_char() - if(isoperator(c)): - self.save_char(c) - else: - break - name=self.token_buff.decode("utf-8") - if(name in _MarkTable): - token.token=_MarkTable[name] - token.name=name - else: - raise Exception(f"不存在的操作符 {name} ") - token.buff=self.token_buff - self.save_token(token) - return c - def read_num_and_save(self,c:int): - token=lex_token("number",bytearray(),TOKEN_NUM,self.line,self.pos) - self.save_char(c) - while True: - c=self.get_next_char() - if(isdigitdot(c)): - self.save_char(c) - else: - break - if(self.token_buff.count(b'.')>1): - raise Exception("数字不能包含多个点号") - token.buff=self.token_buff - self.save_token(token) - return c - def read_str_and_save(self,c:int): - c=self.get_next_char() - while c!=b'\"'[0]: - self.save_char(c) - c=self.get_next_char() - self.save_token(lex_token("string",self.token_buff,TOKEN_STRING,self.line,self.pos)) - return self.get_next_char() - -def lex(text:bytes): - lex_obj = lex_class(text) - c=lex_obj.get_next_char() - while not lex_obj.is_end(): - if isalpha_(c): - c=lex_obj.read_name_and_save(c) - elif isinstr(c,"{}[]()~,;:*"): - lex_obj.save_one_char_token(c) - c=lex_obj.get_next_char() - elif isdigit(c): - c=lex_obj.read_num_and_save(c) - elif isspace(c): - c=lex_obj.get_next_char() - elif isoperator(c): - c=lex_obj.read_operator_and_save(c) - elif isinstr(c,"\""): - c=lex_obj.read_str_and_save(c) - elif isinstr(c,"\\"): - c=lex_obj.get_next_char(c) - if(c!=TOKEN("\r") and c!=TOKEN("\n")): - raise Exception(f"符号 '\\' 必须在行末, line:{lex_obj.line} pos:{lex_obj.pos}") - elif isinstr(c,"/"): - c=lex_obj.get_next_char() - if(c==TOKEN("/")): - while c!=TOKEN("\n"): - c=lex_obj.get_next_char() - elif(c==TOKEN("*")): - c_old=lex_obj.get_next_char() - c=lex_obj.get_next_char() - while not (c_old==TOKEN("*") and c==TOKEN("/")): - c_old=c - c=lex_obj.get_next_char() - c=lex_obj.get_next_char() - elif(c==TOKEN("=")): - lex_obj.save_token(lex_token("/=",b"/=",TOKEN_ASSIG_DIV,lex_obj.line,lex_obj.pos)) - c=lex_obj.get_next_char() - else: - lex_obj.save_one_char_token(TOKEN("/")) - else: - raise Exception(f"未知的字符 {bytes([c])}, line:{lex_obj.line} pos:{lex_obj.pos}") - # for item in lex_obj.token_list: - # print(f"{item}") - return lex_obj.token_list - -if __name__ == "__main__": - with open("main.c",mode='rb') as f: - lex(f.read()) diff --git a/node_declear.py b/node_declear.py deleted file mode 100644 index 5a55cb2..0000000 --- a/node_declear.py +++ /dev/null 
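Both deleted modules are absorbed by parser_c.py below: the lexer previously imported from lex_c.py is inlined, and node_declear.py, which carried its own copies of the dist_node_type_* helpers, is dropped outright. For orientation, a rough sketch of the token stream the lexer loop produces; the input string is illustrative and the printed shapes approximate, with token codes following _KeyWordTable and _MarkTable:

    # Illustrative sketch only: rough shape of lex() output for a tiny input.
    from parser_c import lex          # after this patch, lex() lives in parser_c.py

    for t in lex(b"int a = 1;"):
        print(t.token, bytes(t.buff))
    # TOKEN_INT      b'int'
    # TOKEN_SYMBOL   b'a'
    # TOKEN("=")     b'='
    # TOKEN_NUM      b'1'
    # TOKEN(";")     b';'
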
@@ -1,128 +0,0 @@ -from lex_c import lex_token -import lex_c -from parser_c import node -from parser_c import node_file -from parser_c import node_variable_def -from parser_c import node_struct_decl -from parser_c import node_struct_def -from parser_c import node_union_decl -from parser_c import node_union_def -from parser_c import node_enum_decl -from parser_c import node_enum_def -from parser_c import node_func_decl -from parser_c import node_typedef -from parser_c import node_func_def - -from parser_c import find_sentence -from parser_c import dist_node_type -from parser_c import find_close - - - - - -def dist_node_type_struct(token_list:list[lex_token]): - if(token_list[0].token==lex_c.TOKEN_STRUCT): - if(token_list[1].token==lex_c.TOKEN_SYMBOL): - if(len(token_list)==2): - return node_struct_decl(name=token_list[1].buff.decode("utf-8"),token_list=token_list) - elif(token_list[2].token==lex_c.TOKEN("{")): - if not token_list[-1].token==lex_c.TOKEN("}"): - raise Exception("没有出现预期的符号 '}'") - v_list:list[node_variable_def]=[] - token_list_local=token_list[3:-1] - while len(token_list_local)>0: - sentence=find_sentence(token_list_local) - v_list.append(dist_node_type(token_list=sentence)) - token_list_local=token_list_local[len(sentence):] - return node_struct_def(name=token_list[1].buff.decode("utf-8"),token_list=token_list,body=v_list) - raise Exception(f"语法错误 {token_list[0]}") - - - -def dist_node_type_union(token_list:list[lex_token]): - if(token_list[0].token==lex_c.TOKEN_UNION): - if(token_list[1].token==lex_c.TOKEN_SYMBOL): - if(len(token_list)==2): - return node_union_decl(name=token_list[1].buff.decode("utf-8"),token_list=token_list) - elif(token_list[2].token==lex_c.TOKEN("{")): - if not token_list[-1].token==lex_c.TOKEN("}"): - raise Exception("没有出现预期的符号 '}'") - v_list:list[node_variable_def]=[] - token_list_local=token_list[3:-1] - while len(token_list_local)>0: - sentence=find_sentence(token_list_local) - v_list.append(dist_node_type(token_list=sentence)) - token_list_local=token_list_local[len(sentence):] - return node_union_def(name=token_list[1].buff.decode("utf-8"),token_list=token_list,body=v_list) - raise Exception(f"语法错误 {token_list[0]}") - - - -def dist_node_type_enum(token_list:list[lex_token]): - if(token_list[0].token==lex_c.TOKEN_ENUM): - if(token_list[1].token==lex_c.TOKEN_SYMBOL): - if(len(token_list)==2): - return node_enum_decl(name=token_list[1].buff.decode("utf-8"),token_list=token_list) - elif(token_list[2].token==lex_c.TOKEN("{")): - if not token_list[-1].token==lex_c.TOKEN("}"): - raise Exception("没有出现预期的符号 '}'") - token_list_local=token_list[3:-1] - index=0 - v_list:list[dict]=[] - while len(token_list_local)>0: - if(token_list_local[0].token==lex_c.TOKEN_SYMBOL): - key=token_list_local[0].buff.decode("utf-8") - if(token_list_local[1].token==lex_c.TOKEN("=") and token_list_local[2].token==lex_c.TOKEN_NUM): - index=int(token_list_local[2].buff.decode("utf-8")) - token_list_local=token_list_local[3:] - else: - index+=1 - token_list_local=token_list_local[1:] - v_list.append({key:index}) - if(len(token_list_local)>0): - if(token_list_local[0].token!=lex_c.TOKEN(",")): - raise Exception(f"枚举类型应该使用 ',' 分隔符") - token_list_local=token_list_local[1:] - return node_enum_def(name=token_list[1].buff.decode("utf-8"),token_list=token_list,body=v_list) - raise Exception(f"语法错误 {token_list[0]}") - - -def dist_node_type_typedef(token_list:list[lex_token]): - if(token_list[0].token==lex_c.TOKEN_TYPEDEF): - attr=[] - token_list_local=token_list - 
if(token_list[-1].token!=lex_c.TOKEN_SYMBOL): - raise Exception(f"没有定义新类型 {token_list[-1]}") - name=token_list[-1].buff.decode("utf-8") - token_list=token_list[1:] - while token_list[0].token in [lex_c.TOKEN_UNSIGNED,lex_c.TOKEN_CONST]: - attr.append(token_list[0].name) - token_list=token_list[1:] - if(token_list[0].token==lex_c.TOKEN_STRUCT or token_list[0].token==lex_c.TOKEN_UNION): - attr.append(token_list[0].name) - if(token_list[1].token==lex_c.TOKEN_SYMBOL): - node_r=None - attr.append(token_list[1].buff.decode("utf-8")) - if(token_list[2].token==lex_c.TOKEN("{")): - node_r=dist_node_type(token_list=token_list[1:-1]) - elif(token_list[2].token==lex_c.TOKEN("*")): - attr.append(token_list[2].name) - return node_typedef(name=name,token_list=token_list_local,attr=attr,body=node_r) - if(token_list[0].token==lex_c.TOKEN_SYMBOL): - # 使用typedef 定义过的自定义类型 - attr.append(token_list[0].buff.decode("utf-8")) - token_list=token_list[1:] - else: - # c语言预设类型 - while(token_list[0].token in - [lex_c.TOKEN_INT,lex_c.TOKEN_CHAR,lex_c.TOKEN_SHORT,lex_c.TOKEN_LONG,lex_c.TOKEN_FLOAT, - lex_c.TOKEN_DOUBLE,lex_c.TOKEN_VOID,lex_c.TOKEN("*")]): - attr.append(token_list[0].name) - token_list=token_list[1:] - if(len(token_list)>1): - raise Exception(f"意外的token {token_list[0]}") - return node_typedef(name=name,token_list=token_list_local,attr=attr,body=None) - raise Exception(f"语法错误 {token_list[0]}") - - diff --git a/parser_c.py b/parser_c.py index f5d4dfd..6b9f327 100644 --- a/parser_c.py +++ b/parser_c.py @@ -1,20 +1,441 @@ import os import sys +import shutil import dataclasses -from lex_c import lex_token -from lex_c import lex -import lex_c +import copy + + + +TOKEN_IF = 256, +TOKEN_BREAK = 257, +TOKEN_WHILE=258, +TOKEN_SWITCH=259, +TOKEN_CASE=260, +TOKEN_DO=261, +TOKEN_CHAR=262, +TOKEN_INT=263, +TOKEN_VOID=264, +TOKEN_SYMBOL = 265 , +TOKEN_NUM = 266 ,# 数字 +TOKEN_INC = 267,# 自增 +TOKEN_DEC = 268,# 自减 +TOKEN_EQ = 269,# 相等 +TOKEN_NEQ = 270,# 不相等 +TOKEN_LSH = 271,# 左移 +TOKEN_RSH = 272,# 右移 +TOKEN_LEQ = 273,# 小于等于 +TOKEN_GEQ = 274,# 大于等于 +TOKEN_ELSE = 275, +TOKEN_CONTINUE = 276 , +TOKEN_CONST = 277 , +TOKEN_STATIC = 278 , +TOKEN_UNSIGNED = 279 , +TOKEN_TYPEDEF = 280 , +TOKEN_STRUCT = 281 , +TOKEN_ENUM = 282 , +TOKEN_UNION = 283, +TOKEN_STRING = 284, +TOKEN_DEFAULT = 285, +TOKEN_RETURN = 286, +TOKEN_ASSIG_ADD = 287, +TOKEN_ASSIG_SUB = 288, +TOKEN_ASSIG_MUL = 289, +TOKEN_ASSIG_DIV = 290, +TOKEN_ASSIG_LSH = 291, +TOKEN_ASSIG_RSH = 292, +TOKEN_EXTERN = 293, +TOKEN_FLOAT = 294, +TOKEN_DOUBLE = 295, +TOKEN_SHORT = 296, +TOKEN_LONG = 297, +TOKEN_POINTER = 298, +TOKEN_LOGICAL_OR = 299,# 逻辑或 +TOKEN_LOGICAL_AND = 300,# 逻辑与 +TOKEN_OMIT = 301,# 省略符 ... 
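+# For orientation: single-character tokens are encoded by the TOKEN() helper
+# defined just below as their raw byte value, which is always below 256, so the
+# named codes above start at 256 and cannot collide with them. (The trailing
+# commas also make each named constant a one-element tuple; this stays
+# consistent because the same constants are stored in the keyword and operator
+# tables and compared against each other.) A standalone, illustrative sketch:
+#
+#     def TOKEN(t: str) -> int:        # same helper as defined below
+#         return t.encode("utf-8")[0]
+#
+#     assert TOKEN("=") == 0x3D and TOKEN("{") == 0x7B   # single chars stay < 256
+#     # Multi-character operators therefore need dedicated codes >= 256,
+#     # e.g. "->" -> TOKEN_POINTER, "||" -> TOKEN_LOGICAL_OR, "..." -> TOKEN_OMIT.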
+ + +def TOKEN(t:str): + return t.encode("utf-8")[0] + +_KeyWordTable={ + "if":TOKEN_IF, + "else":TOKEN_ELSE, + "break":TOKEN_BREAK, + "while":TOKEN_WHILE, + "switch":TOKEN_SWITCH, + "case":TOKEN_CASE, + "do":TOKEN_DO, + "char":TOKEN_CHAR, + "int":TOKEN_INT, + "void":TOKEN_VOID, + "continue":TOKEN_CONTINUE, + "const":TOKEN_CONST, + "static":TOKEN_STATIC, + "unisgned":TOKEN_UNSIGNED, + "typedef":TOKEN_TYPEDEF, + "struct":TOKEN_STRUCT, + "enum":TOKEN_ENUM, + "union":TOKEN_UNION, + "default":TOKEN_DEFAULT, + "return":TOKEN_RETURN, + "extern":TOKEN_EXTERN, + "float":TOKEN_FLOAT, + "double":TOKEN_DOUBLE, + "short":TOKEN_SHORT, + "long":TOKEN_LONG, +} + +_MarkTable={ + "<<":TOKEN_LSH, + ">>":TOKEN_RSH, + "<=":TOKEN_LEQ, + ">=":TOKEN_GEQ, + "!=":TOKEN_NEQ, + "==":TOKEN_EQ, + "++":TOKEN_INC, + "--":TOKEN_DEC, + "->":TOKEN_POINTER, + "+=":TOKEN_ASSIG_ADD, + "-=":TOKEN_ASSIG_SUB, + "*=":TOKEN_ASSIG_MUL, + "<<=":TOKEN_ASSIG_LSH, + ">>=":TOKEN_ASSIG_RSH, + "=":TOKEN("="), + "!":TOKEN("!"), + "<":TOKEN("<"), + ">":TOKEN(">"), + "+":TOKEN("+"), + "-":TOKEN("-"), + +} + + + +# 是否是数字加字母 +def isalnum(num:int): + return bytes([num]).isalnum() + +# 是否是数字加字母或下划线 +def isalnum_(num:int): + return bytes([num]).isalnum() or num==TOKEN("_") + +# 是否是字母 +def isalpha(num:int): + return bytes([num]).isalpha() + +# 是否是字母或下划线 +def isalpha_(num:int): + return bytes([num]).isalpha() or num==TOKEN("_") + +# 是否是数字 +def isdigit(num:int): + return bytes([num]).isdigit() + +# 是否是数字或小数点 +def isdigitdot(num:int): + return bytes([num]).isdigit() or num==TOKEN(".") + +# 是否是空白字符 包括换行符 +def isspace(num:int): + return bytes([num]).isspace() + +# 是否是给定字符串之一 +def isinstr(num:int,t:str): + c=bytes([num]) + return c in t.encode("utf-8") + +# 是否是操作符 +def isoperator(num:int): + return isinstr(num,"<>!+-=") + +@dataclasses.dataclass +class lex_token: + name:str + buff:bytearray + token:int + line:int + pos:int + +# 连写的操作符,这些实际上是多个操作符写在一起的结果 +_NotMarkTable={ + "!!":[lex_token("operator",'!',TOKEN('!'),0,0), + lex_token("operator",'!',TOKEN('!'),0,0)], + "=-":[lex_token("operator",'=',TOKEN('='),0,0), + lex_token("operator",'-',TOKEN('-'),0,0)], + "--=":[lex_token("operator",'--',TOKEN_DEC,0,0), + lex_token("operator",'=',TOKEN('='),0,0)], + "++=":[lex_token("operator",'++',TOKEN_INC,0,0), + lex_token("operator",'=',TOKEN('='),0,0)], + "=--":[lex_token("operator",'=',TOKEN('='),0,0), + lex_token("operator",'--',TOKEN_DEC,0,0)], + "=++":[lex_token("operator",'=',TOKEN('='),0,0), + lex_token("operator",'++',TOKEN_INC,0,0)], + "!=--":[lex_token("operator",'!=',TOKEN_NEQ,0,0), + lex_token("operator",'--',TOKEN_DEC,0,0)], + "!=++":[lex_token("operator",'!=',TOKEN_NEQ,0,0), + lex_token("operator",'++',TOKEN_INC,0,0)], + "==--":[lex_token("operator",'==',TOKEN_EQ,0,0), + lex_token("operator",'--',TOKEN_DEC,0,0)], + "==++":[lex_token("operator",'==',TOKEN_EQ,0,0), + lex_token("operator",'++',TOKEN_INC,0,0)], +} + + + +class lex_class(object): + def __init__(self,text:bytes,file_name:str="") -> None: + self.text=text + self.index=-1 + self.line=1 + self.pos=-1 + self.token_list:list[lex_token]=[] + self.token_buff=bytearray() + self.file_name=file_name + self.macro_table={} + def save_char(self,c:int): + self.token_buff.append(c&0xff) + def save_token(self,token:lex_token): + self.token_list.append(token) + self.token_buff=bytearray() + def _get_char(self): + if(self.index=len(self.text) + def save_one_char_token(self,c:int): + token=lex_token(bytes([c]).decode("utf-8"),bytes([c]),c,self.line,self.pos) + self.save_token(token) + def 
read_name_and_save(self,c:int): + token=lex_token("symbol",bytearray(),TOKEN_SYMBOL,self.line,self.pos) + self.save_char(c) + while True: + c=self.get_next_char() + if(isalnum_(c)): + self.save_char(c) + else: + break + name=self.token_buff.decode("utf-8") + if(name in _KeyWordTable): + token.token=_KeyWordTable[name] + token.name=name + token.buff=self.token_buff + self.save_token(token) + return c + def read_operator_and_save(self,c:int): + token=lex_token("operator",bytearray(),TOKEN_SYMBOL,self.line,self.pos) + self.save_char(c) + while True: + c=self.get_next_char() + if(isoperator(c)): + self.save_char(c) + else: + break + name=self.token_buff.decode("utf-8") + if(name in _MarkTable): + token.token=_MarkTable[name] + token.name=name + token.buff=self.token_buff + self.save_token(token) + elif(name in _NotMarkTable): + tokens=_NotMarkTable[name] + for t in tokens: + token.token=t.token + token.name=t.name + token.buff=token.name.encode("utf-8") + self.save_token(copy.deepcopy(token)) + token.pos+=len(token.name) + else: + raise Exception(f"不存在的操作符 {name} {self.file_name}:{self.line},{self.pos}") + # print(f"不存在的操作符 {name} ") + return c + def read_num_and_save(self,c:int): + token=lex_token("number",bytearray(),TOKEN_NUM,self.line,self.pos) + self.save_char(c) + while True: + c=self.get_next_char() + if(isdigitdot(c)): + self.save_char(c) + else: + break + if(self.token_buff.count(b'.')>1): + raise Exception(f"数字不能包含多个点号 {self.file_name}:{self.line},{self.pos}") + token.buff=self.token_buff + self.save_token(token) + return c + _escape_table={'0':0,'a':7,'b':8,'t':9,'n':10,'v':11,'f':12,'r':13,'"':34,'\'':39,'?':63,'\\':92} + def read_str_and_save(self,c:int): + c=self.get_next_char() + while c!=b'\"'[0]: + if(c==TOKEN('\\')):# \ + c=self.get_next_char() + self.save_char(self._escape_table.get(c,0)) + else: + self.save_char(c) + c=self.get_next_char() + self.save_token(lex_token("string",self.token_buff,TOKEN_STRING,self.line,self.pos)) + return self.get_next_char() + def read_char_and_save(self,c:int): + c=self.get_next_char() + while c!=b'\''[0]: + if(c==TOKEN('\\')):# \ + c=self.get_next_char() + self.save_char(self._escape_table.get(c,0)) + else: + self.save_char(c) + c=self.get_next_char() + self.save_token(lex_token("string",self.token_buff,TOKEN_STRING,self.line,self.pos)) + return self.get_next_char() + def deal_macro(self,buff:bytearray): + self.macro_result=False + sp=buff.decode('utf-8').split() + if(len(sp)>0): + if(sp[0]=='#define'): + if(len(sp)>=3): + if not (sp[1] in self.macro_table): + self.macro_table[sp[1]]=' '.join(sp[2:]) + else: + if not (sp[1] in self.macro_table): + self.macro_table[sp[1]]="" + elif(sp[0]=='#ifdef'): + self.macro_result= (sp[1] in self.macro_table) + return self.macro_result + elif(sp[0]=='#if'): + t=' '.join(sp[1:])# 判断条件比较复杂,暂时固定返回失败 + return self.macro_result + elif(sp[0]=='#elif'): + return self.macro_result + elif(sp[0]=='#else'): + self.macro_result= not self.macro_result + return self.macro_result + elif(sp[0]=='#endif'): + return True + else: + return True + +def lex(text:bytes,file_name:str=""): + lex_obj = lex_class(text,file_name) + c=lex_obj.get_next_char() + line_old=0 + pos_old=0 + while not lex_obj.is_end(): + line_old=lex_obj.line + pos_old=lex_obj.pos + if isalpha_(c): + c=lex_obj.read_name_and_save(c) + elif isinstr(c,"{}[]()~,;:*?%^"): + lex_obj.save_one_char_token(c) + c=lex_obj.get_next_char() + elif isdigit(c): + c=lex_obj.read_num_and_save(c) + elif isspace(c): + c=lex_obj.get_next_char() + elif isoperator(c): + 
c=lex_obj.read_operator_and_save(c) + elif isinstr(c,"\""): + c=lex_obj.read_str_and_save(c) + elif isinstr(c,"\'"): + c=lex_obj.read_char_and_save(c) + elif isinstr(c,"\\"): + c=lex_obj.get_next_char() + if(c!=TOKEN("\r") and c!=TOKEN("\n")): + raise Exception(f"符号 '\\' 必须在行末, {lex_obj.file_name}:{lex_obj.line},{lex_obj.pos}") + elif isinstr(c,"#"): # 宏定义 + c_old=c + buff=bytearray() + while (c!=TOKEN("\n") and c!=-1): + c=lex_obj.get_next_char() + if(c_old==TOKEN('/') and c==TOKEN('*')):# 适配宏后面有注释的情况 + while not (c_old==TOKEN("*") and c==TOKEN("/")) or c==-1: + c_old=c + c=lex_obj.get_next_char() + elif(c_old==TOKEN('/') and c==TOKEN('/')): + while not (c==TOKEN('\n') or c==-1): + c=lex_obj.get_next_char() + elif(c_old==TOKEN('\\') and c in [TOKEN('\n'),TOKEN('\r')]):# 适配多行 + c=lex_obj.get_next_char() + else: + buff.append(c_old&0xff) + c_old=c + if not (lex_obj.deal_macro(buff)): # 处理宏 + is_space=True + while True: + c=lex_obj.get_next_char() + if(is_space and c==TOKEN('#')): + break + if(c==-1): + break + if not isspace(c): + is_space=False + elif(c==TOKEN('\n')): + is_space=True + elif isinstr(c,"/"): + c=lex_obj.get_next_char() + if(c==TOKEN("/")): + while (c!=TOKEN("\n") and c!=-1): + c=lex_obj.get_next_char() + elif(c==TOKEN("*")): + c_old=lex_obj.get_next_char() + c=lex_obj.get_next_char() + while not (c_old==TOKEN("*") and c==TOKEN("/")): + c_old=c + c=lex_obj.get_next_char() + c=lex_obj.get_next_char() + elif(c==TOKEN("=")): + lex_obj.save_token(lex_token("/=",b"/=",TOKEN_ASSIG_DIV,lex_obj.line,lex_obj.pos)) + c=lex_obj.get_next_char() + else: + lex_obj.save_one_char_token(TOKEN("/")) + elif isinstr(c,"|"): + c=lex_obj.get_next_char() + if(c==TOKEN("|")): + lex_obj.save_token(lex_token("||",b"||",TOKEN_LOGICAL_OR,lex_obj.line,lex_obj.pos)) + else: + lex_obj.save_one_char_token(TOKEN("|")) + elif isinstr(c,"&"): + c=lex_obj.get_next_char() + if(c==TOKEN("&")): + lex_obj.save_token(lex_token("&&",b"&&",TOKEN_LOGICAL_AND,lex_obj.line,lex_obj.pos)) + else: + lex_obj.save_one_char_token(TOKEN("&")) + elif isinstr(c,'.'): + c=lex_obj.get_next_char() + if(c==TOKEN('.')): + c=lex_obj.get_next_char() + if(c==TOKEN('.')): + lex_obj.save_token(lex_token("...",b"...",TOKEN_OMIT,lex_obj.line,lex_obj.pos)) + else: + raise Exception (f"格式错误 {bytes([c])}, {lex_obj.file_name}:{lex_obj.line},{lex_obj.pos}") + else: + lex_obj.save_one_char_token(TOKEN(".")) + else: + raise Exception(f"未知的字符 {bytes([c])}, {lex_obj.file_name}:{lex_obj.line},{lex_obj.pos}") + # c=lex_obj.get_next_char() + # if(line_old==lex_obj.line and pos_old==lex_obj.pos): + # print(f"pointer not move.") + # print(line_old,pos_old) + # for item in lex_obj.token_list: + # print(f"{item}") + return lex_obj.token_list -_NodeTypeTable=[ - "file","vdecl","fdef" -] @dataclasses.dataclass class node: - name:list[str]=dataclasses.field(default_factory=list) + name:list=dataclasses.field(default_factory=list) type:str="base" - token_list:list[lex_token]=dataclasses.field(default_factory=list) + token_list:list=dataclasses.field(default_factory=list) child:list=dataclasses.field(default_factory=list) def complite(self): print(f"complite {self.type}") @@ -125,7 +546,7 @@ class node_int(node): # 找到闭合的括号 -def find_close(token_list:list[lex_token],token:tuple[int,int]): +def find_close(token_list:list,token:tuple): if token_list[0].token!=token[0]: return 0 num=0 @@ -136,20 +557,33 @@ def find_close(token_list:list[lex_token],token:tuple[int,int]): num-=1 if(num==0): return index - raise Exception(f"没有找到闭合的符号 {token[1]}") + raise 
Exception(f"没有找到闭合的符号 {token_list[0]}") + +# 找到指定token的index +def find_token(token_list:list,token:int): + num=0 + for index,item in enumerate(token_list): + if(item.token!=token): + num+=1 + else: + return num + return num + # 找到一个完整的语句 -def find_sentence(token_list:list[lex_token],sep:list[int]=[lex_c.TOKEN(";"),lex_c.TOKEN(":")]): +def find_sentence(token_list:list,sep:list=[TOKEN(";"),TOKEN(":")]): bracket_flag=False index=0 + if(len(token_list)==1): + return token_list while index0): bracket_flag=True index+=bracket_index - elif(token_list[index].token==lex_c.TOKEN("{")): - bracket_index=find_close(token_list[index:],(lex_c.TOKEN("{"),lex_c.TOKEN("}"))) + elif(token_list[index].token==TOKEN("{")): + bracket_index=find_close(token_list[index:],(TOKEN("{"),TOKEN("}"))) if(bracket_index>0): index+=bracket_index if(bracket_flag==True): @@ -157,7 +591,7 @@ def find_sentence(token_list:list[lex_token],sep:list[int]=[lex_c.TOKEN(";"),lex elif(token_list[index].token in sep): return token_list[:index+1] index+=1 - raise Exception(f"没有找到完整的语句") + raise Exception(f"没有找到完整的语句 sep={sep} token={token_list[0]}") @@ -171,140 +605,162 @@ def find_sentence(token_list:list[lex_token],sep:list[int]=[lex_c.TOKEN(";"),lex -def dist_node_type_struct(token_list:list[lex_token]): - if(token_list[0].token==lex_c.TOKEN_STRUCT): - if(token_list[1].token==lex_c.TOKEN_SYMBOL): +def dist_node_type_struct(token_list:list): + if(token_list[0].token==TOKEN_STRUCT): + if(token_list[1].token==TOKEN_SYMBOL): if(len(token_list)==2): - return node_struct_decl(name=token_list[1].buff.decode("utf-8"),token_list=token_list) - elif(token_list[2].token==lex_c.TOKEN("{")): - if not token_list[-1].token==lex_c.TOKEN("}"): - raise Exception("没有出现预期的符号 '}'") - v_list:list[node_variable_def]=[] - token_list_local=token_list[3:-1] - while len(token_list_local)>0: - sentence=find_sentence(token_list_local) - v_list.append(dist_node_type(token_list=sentence)) - token_list_local=token_list_local[len(sentence):] - return node_struct_def(name=token_list[1].buff.decode("utf-8"),token_list=token_list,child=v_list) + return node_struct_decl(name=token_list[1].buff.decode("utf-8"),token_list=token_list,child=[]) + elif(token_list[2].token==TOKEN("{")): + # if not token_list[-1].token==TOKEN("}"): + # raise Exception("没有出现预期的符号 '}'") + # v_list:list[node_variable_def]=[] + # token_list_local=token_list[3:-1] + # while len(token_list_local)>0: + # sentence=find_sentence(token_list_local) + # v_list.append(dist_node_type(token_list=sentence)) + # token_list_local=token_list_local[len(sentence):] + return node_struct_def(name=token_list[1].buff.decode("utf-8"),token_list=token_list,child=[]) + else: + return node_struct_decl(name=token_list[1].buff.decode("utf-8"),token_list=token_list,child=[]) + if(find_token(token_list,TOKEN('('))0: - sentence=find_sentence(token_list_local) - v_list.append(dist_node_type(token_list=sentence)) - token_list_local=token_list_local[len(sentence):] - return node_union_def(name=token_list[1].buff.decode("utf-8"),token_list=token_list,child=v_list) + elif(token_list[2].token==TOKEN("{")): + # if not token_list[-1].token==TOKEN("}"): + # raise Exception("没有出现预期的符号 '}'") + # v_list:list[node_variable_def]=[] + # token_list_local=token_list[3:-1] + # while len(token_list_local)>0: + # sentence=find_sentence(token_list_local) + # v_list.append(dist_node_type(token_list=sentence)) + # token_list_local=token_list_local[len(sentence):] + return 
node_union_def(name=token_list[1].buff.decode("utf-8"),token_list=token_list,child=[]) + if(find_token(token_list,TOKEN('('))1): - raise Exception(f"意外的token {token_list[0]}") + # if(len(token_list)>1): + # raise Exception(f"意外的token {token_list[0]}") return node_typedef(name=name,token_list=token_list_local,child=[]) raise Exception(f"语法错误 {token_list[0]}") # 找到子节点 -def find_child(token_list:list[lex_token],seq:list[int]=[lex_c.TOKEN(";"),lex_c.TOKEN(":")]): +def find_child(token_list:list,seq:list=[TOKEN(";"),TOKEN(":")]): child=[] + token_list_local=[] for i in range(len(token_list)): - if(token_list[i].token==lex_c.TOKEN("{")): + if(token_list[i].token==TOKEN("{")): token_list_local=token_list[i+1:-1] break while len(token_list_local)>0: sentence=find_sentence(token_list_local,seq) node_d=dist_node_type(sentence) - child.append(node_d) + if not node_d is None: + child.append(node_d) token_list_local=token_list_local[len(sentence):] return child -def dist_node_type_funcdef(token_list:list[lex_token]): +def dist_node_type_funcdef(token_list:list): for i in range(len(token_list)): - if(token_list[i].token==lex_c.TOKEN_SYMBOL): - name=token_list[i].buff.decode("utf-8") + if(token_list[i].token==TOKEN('(')): + name=token_list[i-1].buff.decode("utf-8") break - return node_func_def(name=[name],token_list=token_list,child=find_child(token_list)) + # return node_func_def(name=[name],token_list=token_list,child=find_child(token_list)) + return node_func_def(name=[name],token_list=token_list,child=[]) -def dist_node_type_funcdecl(token_list:list[lex_token]): +def dist_node_type_funcdecl(token_list:list): for i in range(len(token_list)): - if(token_list[i].token==lex_c.TOKEN_SYMBOL): + if(token_list[i].token==TOKEN_SYMBOL): name=token_list[i].buff.decode("utf-8") return node_func_decl(name=[name],token_list=token_list,child=[]) raise Exception(f"函数声明格式错误 {token_list[0]}") # 第一个token是symbol的处理 -def dist_node_type_symbol(token_list:list[lex_token]): +def dist_node_type_symbol(token_list:list): # 变量赋值或函数调用 if(len(token_list)==1): return node_symbol(name=token_list[0].buff.decode("utf-8"),token_list=token_list) - if(token_list[1].token == lex_c.TOKEN("(")): + if(token_list[1].token == TOKEN("(")): child=find_child(token_list=token_list[2:-1]) return node_call("call",token_list=token_list,child=child) elif(token_list[1].token in [ - lex_c.TOKEN("="),lex_c.TOKEN_ASSIG_ADD,lex_c.TOKEN_ASSIG_DIV,lex_c.TOKEN_ASSIG_LSH, - lex_c.TOKEN_ASSIG_MUL,lex_c.TOKEN_ASSIG_RSH,lex_c.TOKEN_ASSIG_SUB]): + TOKEN("="),TOKEN_ASSIG_ADD,TOKEN_ASSIG_DIV,TOKEN_ASSIG_LSH, + TOKEN_ASSIG_MUL,TOKEN_ASSIG_RSH,TOKEN_ASSIG_SUB]): name=token_list[1].name - child=[node_symbol(name=token_list[0].buff.decode("utf-8"),token_list=token_list[:1]), - dist_node_type(token_list=token_list[2:])] + child=[node_symbol(name=token_list[0].buff.decode("utf-8"),token_list=token_list[:1]),] + child_d=dist_node_type(token_list=token_list[2:]) + if not child_d is None: + child.append(child_d) return node_opt(name=name,token_list=token_list,child=child) else: # 没有赋值属性的操作 @@ -325,80 +781,181 @@ def dist_node_type_symbol(token_list:list[lex_token]): # 判断一个语句的类型 -def dist_node_type(token_list:list[lex_token]): - if(token_list[0].token==lex_c.TOKEN_EXTERN): +def dist_node_type(token_list:list): + # print(f"{token_list[0]}") + if(token_list[0].token==TOKEN_EXTERN): token_list=token_list[1:] - if(token_list[-1].token==lex_c.TOKEN(";")): + if(token_list[-1].token==TOKEN(";")): token_list=token_list[:-1] - if(token_list[0].token==lex_c.TOKEN_STRUCT): + 
if(len(token_list)==0): + return None + if(token_list[0].token==TOKEN_STRUCT): return dist_node_type_struct(token_list=token_list) - if(token_list[0].token==lex_c.TOKEN_UNION): + if(token_list[0].token==TOKEN_UNION): return dist_node_type_union(token_list=token_list) - if(token_list[0].token==lex_c.TOKEN_ENUM): + if(token_list[0].token==TOKEN_ENUM): return dist_node_type_enum(token_list=token_list) - if(token_list[0].token==lex_c.TOKEN_TYPEDEF): + if(token_list[0].token==TOKEN_TYPEDEF): return dist_node_type_typedef(token_list=token_list) - if(token_list[0].token==lex_c.TOKEN_SWITCH): - child=find_child(token_list) - return node_switch(name="",token_list=token_list,child=child) - if(token_list[0].token==lex_c.TOKEN_CASE): - name=token_list[1].buff.decode("utf-8") - return node_case(name=name,token_list=token_list,child=[]) - if(token_list[0].token==lex_c.TOKEN_DEFAULT): - return node_default(name="",token_list=token_list,child=[]) - if(token_list[0].token==lex_c.TOKEN_BREAK): - return node_break(name="",token_list=token_list,child=[]) - if(token_list[0].token==lex_c.TOKEN_RETURN): - if(len(token_list)>1): - child=[dist_node_type(token_list[1:])] - else: - child=[] - return node_return(name="",token_list=token_list,child=child) - if(token_list[0].token==lex_c.TOKEN_STRING): + # if(token_list[0].token==TOKEN_SWITCH): + # child=find_child(token_list) + # return node_switch(name="",token_list=token_list,child=child) + # if(token_list[0].token==TOKEN_CASE): + # name=token_list[1].buff.decode("utf-8") + # return node_case(name=name,token_list=token_list,child=[]) + # if(token_list[0].token==TOKEN_DEFAULT): + # return node_default(name="",token_list=token_list,child=[]) + # if(token_list[0].token==TOKEN_BREAK): + # return node_break(name="",token_list=token_list,child=[]) + # if(token_list[0].token==TOKEN_RETURN): + # if(len(token_list)>1): + # child=[dist_node_type(token_list[1:])] + # else: + # child=[] + # return node_return(name="",token_list=token_list,child=child) + if(token_list[0].token==TOKEN_STRING): name=token_list[0].buff.decode("utf-8") return node_string(name=name,token_list=token_list,child=[]) - if(token_list[0].token==lex_c.TOKEN_NUM): + if(token_list[0].token==TOKEN_NUM): name=token_list[0].buff.decode("utf-8") return node_int(name=name,token_list=token_list,child=[]) - if(token_list[-1].token==lex_c.TOKEN(")")): + if(token_list[-1].token==TOKEN(")")): # 函数声明 return dist_node_type_funcdecl(token_list) - elif(token_list[-1].token==lex_c.TOKEN("}")): - # 函数定义 - return dist_node_type_funcdef(token_list=token_list) - elif(token_list[0].token==lex_c.TOKEN_SYMBOL): + elif(token_list[-1].token==TOKEN("}")): + if(find_token(token_list,TOKEN('('))0: for item in n.child: print_node(item,deep+1) +def find_func_def_in_file(n:node,deep:int,func_name_list:list): + ack=False + if(n.type=='func_def') and (n.name[0] in func_name_list): + print(f"{n.type} {n.name}") + return True + # n.complite() + if (not n.child is None) and len(n.child)>0: + for item in n.child: + ack=find_func_def_in_file(item,deep+1,func_name_list) + if(ack): + return ack + return False -if __name__ == "__main__": - file_name="main.c" +def check_func_def(file_name:str,func_name_list:list): with open(file_name,mode='rb') as f: - token_list=lex(f.read()) + read_d=f.read() + if(read_d[:3]==bytes([0xef,0xbb,0xbf])): + read_d=read_d[3:] + token_list=lex(read_d,file_name) file=node_file(name=file_name,token_list=token_list) while len(token_list)>0: - sentence=find_sentence(token_list) - node_d=dist_node_type(sentence) - 
file.child.append(node_d) - # print('找到一个语句:') - # for item in sentence: - # print(f"\t{item}") + node_d=None + try: + sentence=find_sentence(token_list) + node_d=dist_node_type(sentence) + except Exception as e: + print(f"in {file_name}") + print(f"\t {e}") + break + if not node_d is None: + file.child.append(node_d) token_list=token_list[len(sentence):] - print_node(file,0) \ No newline at end of file + print_node(file,0) + return find_func_def_in_file(file,0,func_name_list) + + +# 找到定义函数的文件 +def find_func_def(file_list:list,func_name_list:str): + ret_list=[] + err_list=[] + for item in file_list: + sys.stdout.write('.') + sys.stdout.flush() + # try: + ack=check_func_def(item,func_name_list) + if(ack): + ret_list.append(item) + # except Exception as e: + # print(e) + # err_list.append(item) + return ret_list,err_list + +# 找到指定后缀的文件 +def find_type(path:str,fix:str): + dlist=os.listdir(path) + file_list=[] + for i in dlist: + ps=os.path.join(path, i) + if os.path.isdir(ps): + file_list+=find_type(ps,fix) + pass + else: + if(ps[-len(fix):]==fix): + file_list.append(ps) + return file_list + + + + +# with open("build/build_log.log",mode="r",encoding="utf-8") as f: +# _out_text=f.readlines() + + +def get_func_list(): + func_list=[] + _out_text=sys.stdin.readlines() + for item in _out_text: + key_str='undefined reference to `' + index=item.find(key_str) + if(index<0): + continue + index+=len(key_str) + index_end=item[index:].find('\'') + func=item[index:index+index_end] + if not (func in func_list): + func_list.append(func) + return func_list + + + + +# 参数是扫描的目录列表 +if __name__=="__main__": + file_list=[] + for item in sys.argv[1:]: + file_list+=find_type(item,'.c') + # file_list=["./dtest/dtest3/kl3_core_mark/core_main.c"] + print(f"there is {len(file_list)} .c file.") + # func_list=get_func_list() + func_list=['main'] + print(func_list) + # find_func_def(['driver/src/hw3/efuse.c'],['efuse_get_d_bg_vbg_cntl']) + ret_list,err_list=find_func_def(file_list,func_list) + print("已找到的文件") + for item in ret_list: + print(item) + print("分析失败的文件") + for item in err_list: + print(item) \ No newline at end of file
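
The commit subject ("fix symbols that were not recognized") is carried mainly by the additions above: _NotMarkTable splits operator runs that are really several adjacent operators, the lexer gains '->', '||', '&&', '...', character literals and a minimal #define/#ifdef pass, and check_func_def() strips a UTF-8 BOM before lexing. A small usage sketch, with the input string, directory name and build command as illustrative assumptions rather than part of the patch:

    # Illustrative sketch only.
    from parser_c import lex

    # "=-" is not an operator in _MarkTable; via _NotMarkTable it now lexes as
    # '=' followed by '-' instead of raising "不存在的操作符" (unknown operator).
    print([bytes(t.buff) for t in lex(b"x=-1;")])
    # [b'x', b'=', b'-', b'1', b';']

    # End-to-end use suggested by __main__ above (note get_func_list() is
    # currently commented out there and func_list is hard-coded to ['main']):
    #   cc main.c 2>&1 | python parser_c.py ./src
    # get_func_list() pulls names from "undefined reference to `foo'" lines,
    # find_type() collects the *.c files under each directory argument, and
    # find_func_def() reports which of those files define the missing functions.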