# parser_c/lex_c.py
import dataclasses
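# Token kinds start at 256 so they never collide with single-character tokens,
# which use the character's own byte value (0-255) as their kind.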
TOKEN_IF = 256
TOKEN_BREAK = 257
TOKEN_WHILE = 258
TOKEN_SWITCH = 259
TOKEN_CASE = 260
TOKEN_DO = 261
TOKEN_CHAR = 262
TOKEN_INT = 263
TOKEN_VOID = 264
TOKEN_SYMBOL = 265
TOKEN_NUM = 266   # number literal
TOKEN_INC = 267   # increment (++)
TOKEN_DEC = 268   # decrement (--)
TOKEN_EQ = 269    # equal (==)
TOKEN_NEQ = 270   # not equal (!=)
TOKEN_LSH = 271   # left shift (<<)
TOKEN_RSH = 272   # right shift (>>)
TOKEN_LEQ = 273   # less than or equal (<=)
TOKEN_GEQ = 274   # greater than or equal (>=)
TOKEN_ELSE = 275
TOKEN_CONTINUE = 276
TOKEN_CONST = 277
TOKEN_STATIC = 278
TOKEN_UNSIGNED = 279
TOKEN_TYPEDEF = 280
TOKEN_STRUCT = 281
TOKEN_ENUM = 282
TOKEN_UNION = 283
TOKEN_STRING = 284
TOKEN_DEFAULT = 285
TOKEN_RETURN = 286
def TOKEN(t: str):
    # Map a one-character string to its byte value, used as the token kind.
    return t.encode("utf-8")[0]
_KeyWordTable = {
    "if": TOKEN_IF,
    "else": TOKEN_ELSE,
    "break": TOKEN_BREAK,
    "while": TOKEN_WHILE,
    "switch": TOKEN_SWITCH,
    "case": TOKEN_CASE,
    "do": TOKEN_DO,
    "char": TOKEN_CHAR,
    "int": TOKEN_INT,
    "void": TOKEN_VOID,
    "continue": TOKEN_CONTINUE,
    "const": TOKEN_CONST,
    "static": TOKEN_STATIC,
    "unsigned": TOKEN_UNSIGNED,
    "typedef": TOKEN_TYPEDEF,
    "struct": TOKEN_STRUCT,
    "enum": TOKEN_ENUM,
    "union": TOKEN_UNION,
    "default": TOKEN_DEFAULT,
    "return": TOKEN_RETURN,
}
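# Identifiers are scanned first and then looked up here, so keywords and plain
# symbols share one scanning path (see read_name_and_save below).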
_MarkTable = {
    "<<": TOKEN_LSH,
    ">>": TOKEN_RSH,
    "<=": TOKEN_LEQ,
    ">=": TOKEN_GEQ,
    "!=": TOKEN_NEQ,
    "==": TOKEN_EQ,
    "++": TOKEN_INC,
    "--": TOKEN_DEC,
    "=": TOKEN("="),
    "!": TOKEN("!"),
    "<": TOKEN("<"),
    ">": TOKEN(">"),
    "+": TOKEN("+"),
    "-": TOKEN("-"),
}
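# Note: read_operator_and_save() consumes a maximal run of operator characters
# and requires the whole run to appear in this table, so adjacent operators
# such as "=-" (as in "a=-b") are rejected rather than split into "=" and "-".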
# Character-class helpers. get_next_char() returns -1 at end of input, so
# each helper first rejects values outside the byte range 0-255.
def _is_byte(num: int):
    return 0 <= num < 256

# digit or letter?
def isalnum(num: int):
    return _is_byte(num) and bytes([num]).isalnum()

# digit, letter, or underscore?
def isalnum_(num: int):
    return isalnum(num) or num == TOKEN("_")

# letter?
def isalpha(num: int):
    return _is_byte(num) and bytes([num]).isalpha()

# letter or underscore?
def isalpha_(num: int):
    return isalpha(num) or num == TOKEN("_")

# digit?
def isdigit(num: int):
    return _is_byte(num) and bytes([num]).isdigit()

# digit or decimal point?
def isdigitdot(num: int):
    return isdigit(num) or num == TOKEN(".")

# whitespace (including newline)?
def isspace(num: int):
    return _is_byte(num) and bytes([num]).isspace()

# one of the characters in the given string?
def isinstr(num: int, t: str):
    return _is_byte(num) and bytes([num]) in t.encode("utf-8")

# operator character?
def isoperator(num: int):
    return isinstr(num, "<>!+-=")
@dataclasses.dataclass
class lex_token:
    name: str        # token name ("symbol", "number", keyword text, ...)
    buff: bytearray  # raw bytes of the lexeme
    token: int       # token kind: a TOKEN_* constant or a raw character value
    line: int        # source line (1-based)
    pos: int         # column within the line (0-based)
class lex_class(object):
    def __init__(self, text: bytes) -> None:
        self.text = text
        self.index = -1  # advanced before each read, so it starts just before the text
        self.line = 1
        self.pos = -1
        self.token_list: list[lex_token] = []
        self.token_buff = bytearray()

    def save_char(self, c: int):
        self.token_buff.append(c & 0xFF)

    def save_token(self, token: lex_token):
        self.token_list.append(token)
        self.token_buff = bytearray()  # start a fresh buffer for the next token

    def _get_char(self):
        if 0 <= self.index < len(self.text):
            return self.text[self.index]
        return -1  # end of input

    def get_next_char(self):
        if not self.is_end():
            self.index += 1
        c = self._get_char()
        if c == b'\n'[0]:
            self.line += 1
            self.pos = -1
        else:
            self.pos += 1
        return c

    def is_end(self):
        return self.index >= len(self.text)

    def save_one_char_token(self, c: int):
        # Single-character tokens use the character itself as the token kind.
        token = lex_token(bytes([c]).decode("utf-8"), bytes([c]), c, self.line, self.pos)
        self.save_token(token)
    def read_name_and_save(self, c: int):
        # Scan an identifier; if the finished name is a keyword, upgrade the
        # token kind from TOKEN_SYMBOL to the keyword's token.
        token = lex_token("symbol", bytearray(), TOKEN_SYMBOL, self.line, self.pos)
        self.save_char(c)
        while True:
            c = self.get_next_char()
            if isalnum_(c):
                self.save_char(c)
            else:
                break
        name = self.token_buff.decode("utf-8")
        if name in _KeyWordTable:
            token.token = _KeyWordTable[name]
            token.name = name
        token.buff = self.token_buff
        self.save_token(token)
        return c
    def read_operator_and_save(self, c: int):
        # Scan a maximal run of operator characters and look the whole run
        # up in the operator table.
        token = lex_token("operator", bytearray(), TOKEN_SYMBOL, self.line, self.pos)
        self.save_char(c)
        while True:
            c = self.get_next_char()
            if isoperator(c):
                self.save_char(c)
            else:
                break
        name = self.token_buff.decode("utf-8")
        if name in _MarkTable:
            token.token = _MarkTable[name]
            token.name = name
        else:
            raise Exception(f"unknown operator {name!r}")
        token.buff = self.token_buff
        self.save_token(token)
        return c
    def read_num_and_save(self, c: int):
        # Scan digits (and at most one decimal point) into a number token.
        token = lex_token("number", bytearray(), TOKEN_NUM, self.line, self.pos)
        self.save_char(c)
        while True:
            c = self.get_next_char()
            if isdigitdot(c):
                self.save_char(c)
            else:
                break
        if self.token_buff.count(b'.') > 1:
            raise Exception("a number may contain at most one decimal point")
        token.buff = self.token_buff
        self.save_token(token)
        return c
    def read_str_and_save(self, c: int):
        # Scan a double-quoted string literal (no escape handling).
        c = self.get_next_char()
        while c != b'"'[0]:
            if c < 0:
                raise Exception(f"unterminated string literal at line:{self.line}")
            self.save_char(c)
            c = self.get_next_char()
        self.save_token(lex_token("string", self.token_buff, TOKEN_STRING, self.line, self.pos))
        return self.get_next_char()  # skip the closing quote
def lex(text: bytes):
    # Main entry point: tokenize the whole input, dispatching on each
    # token's first character.
    lex_obj = lex_class(text)
    c = lex_obj.get_next_char()
    while not lex_obj.is_end():
        if isalpha_(c):
            c = lex_obj.read_name_and_save(c)
        elif isinstr(c, "{}[]()~,;:*"):
            lex_obj.save_one_char_token(c)
            c = lex_obj.get_next_char()
        elif isdigit(c):
            c = lex_obj.read_num_and_save(c)
        elif isspace(c):
            c = lex_obj.get_next_char()
        elif isoperator(c):
            c = lex_obj.read_operator_and_save(c)
        elif isinstr(c, "\""):
            c = lex_obj.read_str_and_save(c)
        else:
            raise Exception(f"unexpected char {bytes([c])} at line:{lex_obj.line} pos:{lex_obj.pos}")
    # for item in lex_obj.token_list:
    #     print(f"{item}")
    return lex_obj.token_list
if __name__ == "__main__":
    with open("main.c", mode='rb') as f:
        lex(f.read())
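# A minimal usage sketch (the input bytes below are a hypothetical example;
# any C snippet works):
#
#     tokens = lex(b"int x = 1 << 2;")
#     for tok in tokens:
#         print(tok.name, tok.token, tok.line, tok.pos)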