import os
import sys
import dataclasses

# Token ids for keywords, literals and multi-character operators.
# They start at 256 so they cannot collide with single-character tokens,
# whose id is simply the byte value of the character (see TOKEN()).
# NOTE: these must be plain ints -- a trailing comma after the value would
# silently turn the constant into a 1-tuple.
TOKEN_IF = 256
TOKEN_BREAK = 257
TOKEN_WHILE = 258
TOKEN_SWITCH = 259
TOKEN_CASE = 260
TOKEN_DO = 261
TOKEN_CHAR = 262
TOKEN_INT = 263
TOKEN_VOID = 264
TOKEN_SYMBOL = 265
TOKEN_NUM = 266  # numeric literal
TOKEN_INC = 267  # increment        ++
TOKEN_DEC = 268  # decrement        --
TOKEN_EQ = 269  # equal            ==
TOKEN_NEQ = 270  # not equal        !=
TOKEN_LSH = 271  # left shift       <<
TOKEN_RSH = 272  # right shift      >>
TOKEN_LEQ = 273  # less or equal    <=
TOKEN_GEQ = 274  # greater or equal >=
TOKEN_ELSE = 275
TOKEN_CONTINUE = 276
TOKEN_CONST = 277
TOKEN_STATIC = 278
TOKEN_UNSIGNED = 279
TOKEN_TYPEDEF = 280
TOKEN_STRUCT = 281
TOKEN_ENUM = 282
TOKEN_UNION = 283
TOKEN_STRING = 284
TOKEN_DEFAULT = 285
TOKEN_RETURN = 286
TOKEN_ASSIG_ADD = 287
TOKEN_ASSIG_SUB = 288
TOKEN_ASSIG_MUL = 289
TOKEN_ASSIG_DIV = 290
TOKEN_ASSIG_LSH = 291
TOKEN_ASSIG_RSH = 292
TOKEN_EXTERN = 293
TOKEN_FLOAT = 294
TOKEN_DOUBLE = 295
TOKEN_SHORT = 296
TOKEN_LONG = 297


def TOKEN(t: str):
    """Return the token id of a single-character token: its first UTF-8 byte."""
    return t.encode("utf-8")[0]


# Reserved words -> token id.
_KeyWordTable = {
    "if": TOKEN_IF,
    "else": TOKEN_ELSE,
    "break": TOKEN_BREAK,
    "while": TOKEN_WHILE,
    "switch": TOKEN_SWITCH,
    "case": TOKEN_CASE,
    "do": TOKEN_DO,
    "char": TOKEN_CHAR,
    "int": TOKEN_INT,
    "void": TOKEN_VOID,
    "continue": TOKEN_CONTINUE,
    "const": TOKEN_CONST,
    "static": TOKEN_STATIC,
    "unsigned": TOKEN_UNSIGNED,
    "typedef": TOKEN_TYPEDEF,
    "struct": TOKEN_STRUCT,
    "enum": TOKEN_ENUM,
    "union": TOKEN_UNION,
    "default": TOKEN_DEFAULT,
    "return": TOKEN_RETURN,
    "extern": TOKEN_EXTERN,
    "float": TOKEN_FLOAT,
    "double": TOKEN_DOUBLE,
    "short": TOKEN_SHORT,
    "long": TOKEN_LONG,
}

# Operator spelling -> token id ("/" and "/=" are handled directly in lex()).
_MarkTable = {
    "<<": TOKEN_LSH,
    ">>": TOKEN_RSH,
    "<=": TOKEN_LEQ,
    ">=": TOKEN_GEQ,
    "!=": TOKEN_NEQ,
    "==": TOKEN_EQ,
    "++": TOKEN_INC,
    "--": TOKEN_DEC,
    "+=": TOKEN_ASSIG_ADD,
    "-=": TOKEN_ASSIG_SUB,
    "*=": TOKEN_ASSIG_MUL,
    "<<=": TOKEN_ASSIG_LSH,
    ">>=": TOKEN_ASSIG_RSH,
    "=": TOKEN("="),
    "!": TOKEN("!"),
    "<": TOKEN("<"),
    ">": TOKEN(">"),
    "+": TOKEN("+"),
    "-": TOKEN("-"),
}

# Length of the longest operator spelling; bounds the operator look-ahead.
_MAX_MARK_LEN = max(len(mark) for mark in _MarkTable)


def isalnum(num: int):
    """Return True if the byte value is an ASCII letter or digit."""
    return bytes([num]).isalnum()


def isalnum_(num: int):
    """Return True if the byte value is an ASCII letter, digit or underscore."""
    return bytes([num]).isalnum() or num == TOKEN("_")


def isalpha(num: int):
    """Return True if the byte value is an ASCII letter."""
    return bytes([num]).isalpha()


def isalpha_(num: int):
    """Return True if the byte value is an ASCII letter or underscore."""
    return bytes([num]).isalpha() or num == TOKEN("_")


def isdigit(num: int):
    """Return True if the byte value is an ASCII digit."""
    return bytes([num]).isdigit()


def isdigitdot(num: int):
    """Return True if the byte value is an ASCII digit or a decimal point."""
    return bytes([num]).isdigit() or num == TOKEN(".")


def isspace(num: int):
    """Return True if the byte value is ASCII whitespace (newlines included)."""
    return bytes([num]).isspace()


def isinstr(num: int, t: str):
    """Return True if the byte value is one of the characters of ``t``."""
    c = bytes([num])
    return c in t.encode("utf-8")


def isoperator(num: int):
    """Return True if the byte value can start or continue an operator."""
    return isinstr(num, "<>!+-=")


@dataclasses.dataclass
class lex_token:
    """One lexical token together with its source location."""

    name: str        # keyword/operator spelling, or "symbol"/"number"/"string"
    buff: bytearray  # raw bytes of the token text
    token: int       # token id: a TOKEN_* constant or a single byte value
    line: int        # 1-based line on which the token starts
    pos: int         # 0-based column at which the token starts


class lex_class(object):
    """Cursor over a byte buffer that accumulates the produced tokens.

    The cursor starts *before* the first byte; call get_next_char() once to
    load the first character.  Once the input is exhausted get_next_char()
    returns the sentinel value 0 and is_end() becomes True.
    """

    def __init__(self, text: bytes) -> None:
        self.text = text
        self.index = -1
        self.line = 1
        self.pos = -1
        self.token_list: list[lex_token] = []
        self.token_buff = bytearray()

    def save_char(self, c: int):
        """Append one byte to the text of the token being built."""
        self.token_buff.append(c & 0xff)

    def save_token(self, token: lex_token):
        """Store a finished token and start a fresh token buffer."""
        self.token_list.append(token)
        self.token_buff = bytearray()

    def _get_char(self):
        """Return the byte under the cursor, or 0 when outside the input."""
        if 0 <= self.index < len(self.text):
            return self.text[self.index]
        return 0

    def peek_char(self, offset: int = 1):
        """Return the byte ``offset`` positions ahead without consuming it.

        Returns 0 when that position lies outside the input.
        """
        i = self.index + offset
        if 0 <= i < len(self.text):
            return self.text[i]
        return 0

    def get_next_char(self):
        """Advance the cursor by one byte and return the new current byte.

        Line/column tracking is updated on the way: the character following
        a newline is reported as column 0 of the next line.  At end of input
        the cursor stops moving and 0 is returned.
        """
        if self._get_char() == TOKEN("\n"):
            # Leaving a newline: the next character opens a new line.
            self.line += 1
            self.pos = -1
        if self.index < len(self.text):
            self.index += 1
            self.pos += 1
        return self._get_char()

    def is_end(self):
        """Return True once the cursor has moved past the last byte."""
        return self.index >= len(self.text)

    def save_one_char_token(self, c: int):
        """Emit a token whose id and text are the single byte ``c``."""
        token = lex_token(bytes([c]).decode("utf-8"), bytearray([c]), c,
                          self.line, self.pos)
        self.save_token(token)

    def read_name_and_save(self, c: int):
        """Read an identifier or keyword starting at ``c`` and emit it.

        Returns the first character after the name.
        """
        token = lex_token("symbol", bytearray(), TOKEN_SYMBOL, self.line, self.pos)
        self.save_char(c)
        while True:
            c = self.get_next_char()
            if isalnum_(c):
                self.save_char(c)
            else:
                break
        name = self.token_buff.decode("utf-8")
        if name in _KeyWordTable:
            token.token = _KeyWordTable[name]
            token.name = name
        token.buff = self.token_buff
        self.save_token(token)
        return c

    def read_operator_and_save(self, c: int):
        """Read the longest operator from _MarkTable starting at ``c``.

        Longest-match is required so that adjacent operators are split
        correctly: ``a=-1`` yields ``=`` and ``-`` instead of failing on the
        non-existent operator ``=-``.

        Returns the first character after the operator.
        Raises Exception if no prefix of the operator run is a known operator.
        """
        token = lex_token("operator", bytearray(), TOKEN_SYMBOL, self.line, self.pos)
        # Look ahead (without consuming) at the run of operator characters.
        run = bytearray([c & 0xff])
        while len(run) < _MAX_MARK_LEN:
            nxt = self.peek_char(len(run))
            if not isoperator(nxt):
                break
            run.append(nxt)
        # Pick the longest prefix that is a known operator.
        name = ""
        length = 0
        for n in range(len(run), 0, -1):
            candidate = run[:n].decode("utf-8", "replace")
            if candidate in _MarkTable:
                name = candidate
                length = n
                break
        if length == 0:
            name = run.decode("utf-8", "replace")
            raise Exception(f"不存在的操作符 {name} ")
        # Consume exactly the bytes that belong to the matched operator.
        for b in run[:length]:
            self.save_char(b)
            c = self.get_next_char()
        token.token = _MarkTable[name]
        token.name = name
        token.buff = self.token_buff
        self.save_token(token)
        return c

    def read_num_and_save(self, c: int):
        """Read a run of digits and dots starting at ``c`` and emit a number.

        Returns the first character after the number.
        Raises Exception if the literal contains more than one dot.
        """
        token = lex_token("number", bytearray(), TOKEN_NUM, self.line, self.pos)
        self.save_char(c)
        while True:
            c = self.get_next_char()
            if isdigitdot(c):
                self.save_char(c)
            else:
                break
        if self.token_buff.count(b'.') > 1:
            raise Exception("数字不能包含多个点号")
        token.buff = self.token_buff
        self.save_token(token)
        return c

    def read_str_and_save(self, c: int):
        """Read a double-quoted string literal; ``c`` is the opening quote.

        The token text is the raw bytes between the quotes.  Escape sequences
        are kept verbatim (not decoded), but an escaped quote does not end
        the literal.

        Returns the first character after the closing quote.
        Raises Exception if the input ends before the closing quote.
        """
        line, pos = self.line, self.pos  # location of the opening quote
        quote = TOKEN("\"")
        backslash = TOKEN("\\")
        c = self.get_next_char()
        while c != quote:
            if self.is_end():
                raise Exception(f"字符串未结束, line:{line} pos:{pos}")
            self.save_char(c)
            if c == backslash:
                # Keep the escaped byte too so that \" cannot close the string.
                c = self.get_next_char()
                if self.is_end():
                    raise Exception(f"字符串未结束, line:{line} pos:{pos}")
                self.save_char(c)
            c = self.get_next_char()
        self.save_token(lex_token("string", self.token_buff, TOKEN_STRING, line, pos))
        return self.get_next_char()


def lex(text: bytes):
    """Split C source text into a list of lex_token objects.

    Whitespace, ``//`` comments, ``/* */`` comments and backslash line
    continuations are skipped.

    Raises Exception on an unknown character, an unknown operator, a
    malformed number, a misplaced backslash, or an unterminated string or
    block comment.
    """
    lex_obj = lex_class(text)
    c = lex_obj.get_next_char()
    while not lex_obj.is_end():
        if isalpha_(c):
            c = lex_obj.read_name_and_save(c)
        elif c == TOKEN("*") and lex_obj.peek_char() == TOKEN("="):
            # "*=" must be checked before "*" is emitted as a lone token.
            c = lex_obj.read_operator_and_save(c)
        elif isinstr(c, "{}[]()~,;:*"):
            lex_obj.save_one_char_token(c)
            c = lex_obj.get_next_char()
        elif isdigit(c):
            c = lex_obj.read_num_and_save(c)
        elif isspace(c):
            c = lex_obj.get_next_char()
        elif isoperator(c):
            c = lex_obj.read_operator_and_save(c)
        elif isinstr(c, "\""):
            c = lex_obj.read_str_and_save(c)
        elif isinstr(c, "\\"):
            c = lex_obj.get_next_char()
            if c != TOKEN("\r") and c != TOKEN("\n"):
                raise Exception(f"符号 '\\' 必须在行末, line:{lex_obj.line} pos:{lex_obj.pos}")
        elif isinstr(c, "/"):
            line, pos = lex_obj.line, lex_obj.pos  # location of the "/"
            c = lex_obj.get_next_char()
            if c == TOKEN("/"):
                # Line comment: runs to the newline or to the end of input.
                while c != TOKEN("\n") and not lex_obj.is_end():
                    c = lex_obj.get_next_char()
            elif c == TOKEN("*"):
                # Block comment: scan for the closing "*/" pair.
                c_old = lex_obj.get_next_char()
                c = lex_obj.get_next_char()
                while not (c_old == TOKEN("*") and c == TOKEN("/")):
                    if lex_obj.is_end():
                        raise Exception(f"注释未结束, line:{line} pos:{pos}")
                    c_old = c
                    c = lex_obj.get_next_char()
                c = lex_obj.get_next_char()
            elif c == TOKEN("="):
                lex_obj.save_token(
                    lex_token("/=", bytearray(b"/="), TOKEN_ASSIG_DIV, line, pos))
                c = lex_obj.get_next_char()
            else:
                # Plain division; ``c`` already holds the following character.
                lex_obj.save_token(
                    lex_token("/", bytearray(b"/"), TOKEN("/"), line, pos))
        else:
            raise Exception(f"未知的字符 {bytes([c])}, line:{lex_obj.line} pos:{lex_obj.pos}")
    return lex_obj.token_list


if __name__ == "__main__":
    with open("main.c", mode='rb') as f:
        lex(f.read())