# parser_c/lex_c.py
import dataclasses
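# Token kinds start at 256 so they never collide with single-character tokens,
# which use the character's own byte value (0-255) as their kind.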
TOKEN_IF = 256
TOKEN_BREAK = 257
TOKEN_WHILE = 258
TOKEN_SWITCH = 259
TOKEN_CASE = 260
TOKEN_DO = 261
TOKEN_CHAR = 262
TOKEN_INT = 263
TOKEN_VOID = 264
TOKEN_SYMBOL = 265
TOKEN_NUM = 266   # number literal
TOKEN_INC = 267   # increment (++)
TOKEN_DEC = 268   # decrement (--)
TOKEN_EQ = 269    # equal (==)
TOKEN_NEQ = 270   # not equal (!=)
TOKEN_LSH = 271   # left shift (<<)
TOKEN_RSH = 272   # right shift (>>)
TOKEN_LEQ = 273   # less than or equal (<=)
TOKEN_GEQ = 274   # greater than or equal (>=)
TOKEN_ELSE = 275
TOKEN_CONTINUE = 276
TOKEN_CONST = 277
TOKEN_STATIC = 278
TOKEN_UNSIGNED = 279
TOKEN_TYPEDEF = 280
TOKEN_STRUCT = 281
TOKEN_ENUM = 282
TOKEN_UNION = 283
TOKEN_STRING = 284
TOKEN_DEFAULT = 285
TOKEN_RETURN = 286
def TOKEN(t: str):
    # Map a one-character string to its byte value, used as the token kind.
    return t.encode("utf-8")[0]
_KeyWordTable = {
    "if": TOKEN_IF,
    "else": TOKEN_ELSE,
    "break": TOKEN_BREAK,
    "while": TOKEN_WHILE,
    "switch": TOKEN_SWITCH,
    "case": TOKEN_CASE,
    "do": TOKEN_DO,
    "char": TOKEN_CHAR,
    "int": TOKEN_INT,
    "void": TOKEN_VOID,
    "continue": TOKEN_CONTINUE,
    "const": TOKEN_CONST,
    "static": TOKEN_STATIC,
    "unsigned": TOKEN_UNSIGNED,
    "typedef": TOKEN_TYPEDEF,
    "struct": TOKEN_STRUCT,
    "enum": TOKEN_ENUM,
    "union": TOKEN_UNION,
    "default": TOKEN_DEFAULT,
    "return": TOKEN_RETURN,
}
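# Identifiers are scanned first and then looked up here, so keywords and plain
# symbols share one scanning path (see read_name_and_save below).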
_MarkTable = {
    "<<": TOKEN_LSH,
    ">>": TOKEN_RSH,
    "<=": TOKEN_LEQ,
    ">=": TOKEN_GEQ,
    "!=": TOKEN_NEQ,
    "==": TOKEN_EQ,
    "++": TOKEN_INC,
    "--": TOKEN_DEC,
    "=": TOKEN("="),
    "!": TOKEN("!"),
    "<": TOKEN("<"),
    ">": TOKEN(">"),
    "+": TOKEN("+"),
    "-": TOKEN("-"),
}
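# Note: read_operator_and_save() consumes a maximal run of operator characters
# and requires the whole run to appear in this table, so adjacent operators
# such as "=-" (as in "a=-b") are rejected rather than split into "=" and "-".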
# Character-class helpers. get_next_char() returns -1 at end of input, so
# each helper first rejects values outside the byte range 0-255.
def _is_byte(num: int):
    return 0 <= num < 256

# digit or letter?
def isalnum(num: int):
    return _is_byte(num) and bytes([num]).isalnum()

# digit, letter, or underscore?
def isalnum_(num: int):
    return isalnum(num) or num == TOKEN("_")

# letter?
def isalpha(num: int):
    return _is_byte(num) and bytes([num]).isalpha()

# letter or underscore?
def isalpha_(num: int):
    return isalpha(num) or num == TOKEN("_")

# digit?
def isdigit(num: int):
    return _is_byte(num) and bytes([num]).isdigit()

# digit or decimal point?
def isdigitdot(num: int):
    return isdigit(num) or num == TOKEN(".")

# whitespace (including newline)?
def isspace(num: int):
    return _is_byte(num) and bytes([num]).isspace()

# one of the characters in the given string?
def isinstr(num: int, t: str):
    return _is_byte(num) and bytes([num]) in t.encode("utf-8")

# operator character?
def isoperator(num: int):
    return isinstr(num, "<>!+-=")
@dataclasses.dataclass
class lex_token:
    name: str        # token name ("symbol", "number", keyword text, ...)
    buff: bytearray  # raw bytes of the lexeme
    token: int       # token kind: a TOKEN_* constant or a raw character value
    line: int        # source line (1-based)
    pos: int         # column within the line (0-based)
class lex_class(object):
    def __init__(self, text: bytes) -> None:
        self.text = text
        self.index = -1  # advanced before each read, so it starts just before the text
        self.line = 1
        self.pos = -1
        self.token_list: list[lex_token] = []
        self.token_buff = bytearray()

    def save_char(self, c: int):
        self.token_buff.append(c & 0xFF)

    def save_token(self, token: lex_token):
        self.token_list.append(token)
        self.token_buff = bytearray()  # start a fresh buffer for the next token

    def _get_char(self):
        if 0 <= self.index < len(self.text):
            return self.text[self.index]
        return -1  # end of input

    def get_next_char(self):
        if not self.is_end():
            self.index += 1
        c = self._get_char()
        if c == b'\n'[0]:
            self.line += 1
            self.pos = -1
        else:
            self.pos += 1
        return c

    def is_end(self):
        return self.index >= len(self.text)

    def save_one_char_token(self, c: int):
        # Single-character tokens use the character itself as the token kind.
        token = lex_token(bytes([c]).decode("utf-8"), bytes([c]), c, self.line, self.pos)
        self.save_token(token)
    def read_name_and_save(self, c: int):
        # Scan an identifier; if the finished name is a keyword, upgrade the
        # token kind from TOKEN_SYMBOL to the keyword's token.
        token = lex_token("symbol", bytearray(), TOKEN_SYMBOL, self.line, self.pos)
        self.save_char(c)
        while True:
            c = self.get_next_char()
            if isalnum_(c):
                self.save_char(c)
            else:
                break
        name = self.token_buff.decode("utf-8")
        if name in _KeyWordTable:
            token.token = _KeyWordTable[name]
            token.name = name
        token.buff = self.token_buff
        self.save_token(token)
        return c
    def read_operator_and_save(self, c: int):
        # Scan a maximal run of operator characters and look the whole run
        # up in the operator table.
        token = lex_token("operator", bytearray(), TOKEN_SYMBOL, self.line, self.pos)
        self.save_char(c)
        while True:
            c = self.get_next_char()
            if isoperator(c):
                self.save_char(c)
            else:
                break
        name = self.token_buff.decode("utf-8")
        if name in _MarkTable:
            token.token = _MarkTable[name]
            token.name = name
        else:
            raise Exception(f"unknown operator {name!r}")
        token.buff = self.token_buff
        self.save_token(token)
        return c
    def read_num_and_save(self, c: int):
        # Scan digits (and at most one decimal point) into a number token.
        token = lex_token("number", bytearray(), TOKEN_NUM, self.line, self.pos)
        self.save_char(c)
        while True:
            c = self.get_next_char()
            if isdigitdot(c):
                self.save_char(c)
            else:
                break
        if self.token_buff.count(b'.') > 1:
            raise Exception("a number may contain at most one decimal point")
        token.buff = self.token_buff
        self.save_token(token)
        return c
    def read_str_and_save(self, c: int):
        # Scan a double-quoted string literal (no escape handling).
        c = self.get_next_char()
        while c != b'"'[0]:
            if c < 0:
                raise Exception(f"unterminated string literal at line:{self.line}")
            self.save_char(c)
            c = self.get_next_char()
        self.save_token(lex_token("string", self.token_buff, TOKEN_STRING, self.line, self.pos))
        return self.get_next_char()  # skip the closing quote
def lex(text: bytes):
    # Main entry point: tokenize the whole input, dispatching on each
    # token's first character.
    lex_obj = lex_class(text)
    c = lex_obj.get_next_char()
    while not lex_obj.is_end():
        if isalpha_(c):
            c = lex_obj.read_name_and_save(c)
        elif isinstr(c, "{}[]()~,;:*"):
            lex_obj.save_one_char_token(c)
            c = lex_obj.get_next_char()
        elif isdigit(c):
            c = lex_obj.read_num_and_save(c)
        elif isspace(c):
            c = lex_obj.get_next_char()
        elif isoperator(c):
            c = lex_obj.read_operator_and_save(c)
        elif isinstr(c, "\""):
            c = lex_obj.read_str_and_save(c)
        else:
            raise Exception(f"unexpected char {bytes([c])} at line:{lex_obj.line} pos:{lex_obj.pos}")
    # for item in lex_obj.token_list:
    #     print(f"{item}")
    return lex_obj.token_list
if __name__ == "__main__":
    with open("main.c", mode='rb') as f:
        lex(f.read())
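# A minimal usage sketch (the input bytes below are a hypothetical example;
# any C snippet works):
#
#     tokens = lex(b"int x = 1 << 2;")
#     for tok in tokens:
#         print(tok.name, tok.token, tok.line, tok.pos)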