# Lexer for a small C-subset language: keywords, identifiers, numbers,
# operators, and double-quoted strings.
# Standard-library imports, grouped and sorted.
import dataclasses
import os
import sys
# Token ids for multi-character lexemes. Values start at 256 so they can never
# collide with single-character tokens, whose id is the byte value itself (0-255).
# Bug fix: every original line ended with a stray comma ("TOKEN_IF = 256,"),
# which made each constant a one-element tuple instead of an int — inconsistent
# with the int ids used for single-character tokens and with lex_token.token: int.
TOKEN_IF = 256
TOKEN_BREAK = 257
TOKEN_WHILE = 258
TOKEN_SWITCH = 259
TOKEN_CASE = 260
TOKEN_DO = 261
TOKEN_CHAR = 262
TOKEN_INT = 263
TOKEN_VOID = 264
TOKEN_SYMBOL = 265    # identifier
TOKEN_NUM = 266       # numeric literal
TOKEN_INC = 267       # ++ (increment)
TOKEN_DEC = 268       # -- (decrement)
TOKEN_EQ = 269        # == (equal)
TOKEN_NEQ = 270       # != (not equal)
TOKEN_LSH = 271       # << (left shift)
TOKEN_RSH = 272       # >> (right shift)
TOKEN_LEQ = 273       # <= (less or equal)
TOKEN_GEQ = 274       # >= (greater or equal)
TOKEN_ELSE = 275
TOKEN_CONTINUE = 276
TOKEN_CONST = 277
TOKEN_STATIC = 278
TOKEN_UNSIGNED = 279
TOKEN_TYPEDEF = 280
TOKEN_STRUCT = 281
TOKEN_ENUM = 282
TOKEN_UNION = 283
TOKEN_STRING = 284    # string literal
TOKEN_DEFAULT = 285
TOKEN_RETURN = 286
|
|
|
|
def TOKEN(t: str):
    """Return the first UTF-8 byte of *t* — the token id of a one-char token."""
    encoded = t.encode("utf-8")
    return encoded[0]
|
|
|
|
# Maps C keyword spellings to their token ids; consulted after an identifier
# has been fully read (see lex_class.read_name_and_save).
_KeyWordTable = {
    "if": TOKEN_IF,
    "else": TOKEN_ELSE,
    "break": TOKEN_BREAK,
    "while": TOKEN_WHILE,
    "switch": TOKEN_SWITCH,
    "case": TOKEN_CASE,
    "do": TOKEN_DO,
    "char": TOKEN_CHAR,
    "int": TOKEN_INT,
    "void": TOKEN_VOID,
    "continue": TOKEN_CONTINUE,
    "const": TOKEN_CONST,
    "static": TOKEN_STATIC,
    # Bug fix: this key was misspelled "unisgned", so the real C keyword
    # "unsigned" was lexed as a plain symbol and never matched.
    "unsigned": TOKEN_UNSIGNED,
    "typedef": TOKEN_TYPEDEF,
    "struct": TOKEN_STRUCT,
    "enum": TOKEN_ENUM,
    "union": TOKEN_UNION,
    "default": TOKEN_DEFAULT,
    "return": TOKEN_RETURN,
}
|
|
|
|
# Maps operator spellings to token ids. Two-character operators get named ids;
# single-character operators reuse their own byte value via TOKEN().
_MarkTable = {
    "<<": TOKEN_LSH,
    ">>": TOKEN_RSH,
    "<=": TOKEN_LEQ,
    ">=": TOKEN_GEQ,
    "!=": TOKEN_NEQ,
    "==": TOKEN_EQ,
    "++": TOKEN_INC,
    "--": TOKEN_DEC,
    "=": TOKEN("="),
    "!": TOKEN("!"),
    "<": TOKEN("<"),
    ">": TOKEN(">"),
    "+": TOKEN("+"),
    "-": TOKEN("-"),
}
|
|
|
|
|
|
|
|
# True when the byte is alphanumeric (ASCII letter or digit).
def isalnum(num: int):
    """Return True if the byte value *num* is an ASCII letter or digit."""
    ch = bytes((num,))
    return ch.isalnum()
|
|
|
|
# True for identifier-body bytes: alphanumerics or underscore.
def isalnum_(num: int):
    """Return True if the byte *num* may appear inside an identifier."""
    ch = bytes((num,))
    return ch.isalnum() or num == TOKEN("_")
|
|
|
|
# True when the byte is an ASCII letter.
def isalpha(num: int):
    """Return True if the byte value *num* is alphabetic."""
    ch = bytes((num,))
    return ch.isalpha()
|
|
|
|
# True for identifier-start bytes: letters or underscore.
def isalpha_(num: int):
    """Return True if the byte *num* may start an identifier."""
    ch = bytes((num,))
    return ch.isalpha() or num == TOKEN("_")
|
|
|
|
# True when the byte is a decimal digit.
def isdigit(num: int):
    """Return True if the byte value *num* is an ASCII digit."""
    ch = bytes((num,))
    return ch.isdigit()
|
|
|
|
# True when the byte is a decimal digit or a decimal point.
def isdigitdot(num: int):
    """Return True if the byte *num* is a digit or '.'."""
    ch = bytes((num,))
    return ch.isdigit() or num == TOKEN(".")
|
|
|
|
# True for whitespace bytes, including the newline character.
def isspace(num: int):
    """Return True if the byte value *num* is whitespace (space, tab, newline, ...)."""
    ch = bytes((num,))
    return ch.isspace()
|
|
|
|
# True when the byte occurs in the given string of candidates.
def isinstr(num: int, t: str):
    """Return True if the byte *num* occurs in the UTF-8 encoding of *t*."""
    return bytes((num,)) in t.encode("utf-8")
|
|
|
|
# True for bytes that can start or continue an operator.
def isoperator(num: int):
    """Return True if the byte *num* is one of the operator characters."""
    return isinstr(num, "<>!+-=")
|
|
|
|
@dataclasses.dataclass
class lex_token:
    """One lexical token: its display name, raw bytes, token id, and source position."""
    name: str        # human-readable kind ("symbol", "number", ...) or the lexeme text
    buff: bytearray  # raw bytes that made up the token
    token: int       # token id: a TOKEN_* constant, or the byte value for one-char tokens
    line: int        # 1-based line recorded when the token object was created
    pos: int         # 0-based column recorded when the token object was created
                     # NOTE(review): creation point differs per token kind (start of
                     # names/numbers, end of strings), so pos semantics vary — confirm.
|
|
|
|
|
|
class lex_class(object):
|
|
def __init__(self,text:bytes) -> None:
|
|
self.text=text
|
|
self.index=-1
|
|
self.line=1
|
|
self.pos=-1
|
|
self.token_list:list[lex_token]=[]
|
|
self.token_buff=bytearray()
|
|
def save_char(self,c:int):
|
|
self.token_buff.append(c&0xff)
|
|
def save_token(self,token:lex_token):
|
|
self.token_list.append(token)
|
|
self.token_buff=bytearray()
|
|
def _get_char(self):
|
|
if(self.index<len(self.text)):
|
|
c= self.text[self.index]
|
|
return c
|
|
return -1
|
|
def get_next_char(self):
|
|
if not self.is_end():
|
|
self.index+=1
|
|
c= self._get_char()
|
|
if(c==b'\n'[0]):
|
|
self.line+=1
|
|
self.pos=-1
|
|
else:
|
|
self.pos+=1
|
|
return c
|
|
def is_end(self):
|
|
return self.index>=len(self.text)
|
|
def save_one_char_token(self,c:int):
|
|
token=lex_token(bytes([c]).decode("utf-8"),bytes([c]),c,self.line,self.pos)
|
|
self.save_token(token)
|
|
def read_name_and_save(self,c:int):
|
|
token=lex_token("symbol",bytearray(),TOKEN_SYMBOL,self.line,self.pos)
|
|
self.save_char(c)
|
|
while True:
|
|
c=self.get_next_char()
|
|
if(isalnum_(c)):
|
|
self.save_char(c)
|
|
else:
|
|
break
|
|
name=self.token_buff.decode("utf-8")
|
|
if(name in _KeyWordTable):
|
|
token.token=_KeyWordTable[name]
|
|
token.name=name
|
|
token.buff=self.token_buff
|
|
self.save_token(token)
|
|
return c
|
|
def read_operator_and_save(self,c:int):
|
|
token=lex_token("operator",bytearray(),TOKEN_SYMBOL,self.line,self.pos)
|
|
self.save_char(c)
|
|
while True:
|
|
c=self.get_next_char()
|
|
if(isoperator(c)):
|
|
self.save_char(c)
|
|
else:
|
|
break
|
|
name=self.token_buff.decode("utf-8")
|
|
if(name in _MarkTable):
|
|
token.token=_MarkTable[name]
|
|
token.name=name
|
|
else:
|
|
raise Exception(f"不存在的操作符 {name} ")
|
|
token.buff=self.token_buff
|
|
self.save_token(token)
|
|
return c
|
|
def read_num_and_save(self,c:int):
|
|
token=lex_token("number",bytearray(),TOKEN_NUM,self.line,self.pos)
|
|
self.save_char(c)
|
|
while True:
|
|
c=self.get_next_char()
|
|
if(isdigitdot(c)):
|
|
self.save_char(c)
|
|
else:
|
|
break
|
|
if(self.token_buff.count(b'.')>1):
|
|
raise Exception("数字不能包含多个点号")
|
|
token.buff=self.token_buff
|
|
self.save_token(token)
|
|
return c
|
|
def read_str_and_save(self,c:int):
|
|
c=self.get_next_char()
|
|
while c!=b'\"'[0]:
|
|
self.save_char(c)
|
|
c=self.get_next_char()
|
|
self.save_token(lex_token("string",self.token_buff,TOKEN_STRING,self.line,self.pos))
|
|
return self.get_next_char()
|
|
|
|
def lex(text: bytes):
    """Tokenize raw source bytes and return the list of lex_token records.

    Dispatches on the current byte: identifier/keyword, single-char punctuation,
    number, whitespace (skipped), operator, or string. Raises on any other byte.
    """
    scanner = lex_class(text)
    c = scanner.get_next_char()
    while not scanner.is_end():
        if isalpha_(c):
            c = scanner.read_name_and_save(c)
        elif isinstr(c, "{}[]()~,;:*"):
            scanner.save_one_char_token(c)
            c = scanner.get_next_char()
        elif isdigit(c):
            c = scanner.read_num_and_save(c)
        elif isspace(c):
            c = scanner.get_next_char()
        elif isoperator(c):
            c = scanner.read_operator_and_save(c)
        elif isinstr(c, "\""):
            c = scanner.read_str_and_save(c)
        else:
            raise Exception(f"err char {bytes([c])} at line:{scanner.line} pos:{scanner.pos}")
    return scanner.token_list
|
|
|
|
# Script entry point: tokenize a local C file as a smoke test.
if __name__ == "__main__":
    with open("main.c", mode='rb') as source:
        lex(source.read())
|