# parser_c/parser_c.py

import os
import sys
import dataclasses
from lex_c import lex_token
from lex_c import lex
import lex_c

# Short tags for node kinds.
# NOTE(review): these strings differ from the ``type`` values used by the node
# classes in this file, and the table is not referenced by the visible code;
# confirm whether it is still needed.
_NodeTypeTable=["file","vdecl","fdef"]


@dataclasses.dataclass
class node:
  """Base record for every syntax-tree node produced by this parser.

  Subclasses only override the default of ``type``; every other field is
  inherited from here. ``attr`` and ``body`` are declared because the parsing
  functions in this file construct nodes with ``attr=`` / ``body=`` keywords;
  without these fields those constructor calls raise ``TypeError``.
  """
  # Identifier attached to the node.
  # NOTE(review): annotation and default describe a list, but the constructors
  # in this file pass a single str; kept unchanged for compatibility.
  name:list[str]=dataclasses.field(default_factory=list)
  # Node kind tag; each subclass replaces the default with its own string.
  type:str="base"
  # Tokens the node was built from.
  token_list:list[lex_token]=dataclasses.field(default_factory=list)
  # Nested nodes.
  child:list=dataclasses.field(default_factory=list)
  # Qualifier / type-specifier names collected for the node (typedef parser).
  attr:list=dataclasses.field(default_factory=list)
  # Parsed body: member nodes for struct/union, ``{name: value}`` dicts for
  # enum. The typedef parser passes ``None`` explicitly.
  body:list=dataclasses.field(default_factory=list)
# File node
@dataclasses.dataclass
class node_file(node):
  """Node tagged ``"file"``; the script entry point uses it as the root that
  collects the nodes of the parsed statements."""
  type: str = "file"

# Variable definition node
@dataclasses.dataclass
class node_variable_def(node):
  """Node tagged ``"variable_def"``; adds no fields beyond ``node``."""
  type: str = "variable_def"

# Struct declaration node
@dataclasses.dataclass
class node_struct_decl(node):
  """Node tagged ``"struct_decl"``; built for ``struct NAME`` without a body."""
  type: str = "struct_decl"

# Struct definition node
@dataclasses.dataclass
class node_struct_def(node):
  """Node tagged ``"struct_def"``; built for ``struct NAME { ... }``."""
  type: str = "struct_def"

# Union declaration node
@dataclasses.dataclass
class node_union_decl(node):
  """Node tagged ``"union_decl"``; built for ``union NAME`` without a body."""
  type: str = "union_decl"

# Union definition node
@dataclasses.dataclass
class node_union_def(node):
  """Node tagged ``"union_def"``; built for ``union NAME { ... }``."""
  type: str = "union_def"

# Enum declaration node
@dataclasses.dataclass
class node_enum_decl(node):
  """Node tagged ``"enum_decl"``; built for ``enum NAME`` without a body."""
  type: str = "enum_decl"

# Enum definition node
@dataclasses.dataclass
class node_enum_def(node):
  """Node tagged ``"enum_def"``; built for ``enum NAME { ... }``."""
  type: str = "enum_def"

# Function declaration node
@dataclasses.dataclass
class node_func_decl(node):
  """Node tagged ``"func_decl"``; adds no fields beyond ``node``."""
  type: str = "func_decl"

# typedef node
@dataclasses.dataclass
class node_typedef(node):
  """Node tagged ``"typedef"``; built by the typedef statement parser."""
  type: str = "typedef"

# Function definition node
@dataclasses.dataclass
class node_func_def(node):
  """Node tagged ``"func_def"``; adds no fields beyond ``node``."""
  type: str = "func_def"


# Locate the token that closes the bracket opened by the first token
def find_close(token_list:list[lex_token],token:tuple[int,int]):
  """Return the index of the closer matching the opener at ``token_list[0]``.

  ``token`` is an ``(opener, closer)`` pair of token kinds. Nested pairs are
  balanced with a depth counter.

  Returns 0 when the first token is not the opener, otherwise the index of the
  matching closer. Raises ``Exception`` when the list ends before the depth
  returns to zero.
  """
  opener=token[0]
  if token_list[0].token!=opener:
    return 0
  closer=token[1]
  depth=0
  for position,entry in enumerate(token_list):
    kind=entry.token
    if kind==opener:
      depth+=1
    elif kind==closer:
      depth-=1
    # The first token is the opener, so depth only reaches zero on the closer
    # that balances it.
    if depth==0:
      return position
  raise Exception(f"没有找到闭合的符号 {token[1]}")

# Extract one complete statement from the front of the token stream
def find_sentence(token_list:list[lex_token]):
  """Return the leading slice of ``token_list`` that forms one statement.

  The scan skips over balanced ``( ... )`` and ``{ ... }`` groups. A statement
  ends at the first ``;`` outside such groups, or at the ``}`` closing a brace
  group once a parenthesised group has already been seen (presumably a
  function definition, which has no trailing ``;``).

  Raises ``Exception`` when the tokens run out before a statement end is found.
  """
  seen_paren=False
  pos=0
  total=len(token_list)
  while pos<total:
    current=token_list[pos].token
    if current==lex_c.TOKEN("("):
      offset=find_close(token_list[pos:],(lex_c.TOKEN("("),lex_c.TOKEN(")")))
      if offset>0:
        seen_paren=True
        # Jump to the closing parenthesis; the shared increment below steps past it.
        pos+=offset
    elif current==lex_c.TOKEN("{"):
      offset=find_close(token_list[pos:],(lex_c.TOKEN("{"),lex_c.TOKEN("}")))
      if offset>0:
        pos+=offset
        if seen_paren:
          return token_list[:pos+1]
    elif current==lex_c.TOKEN(";"):
      return token_list[:pos+1]
    pos+=1
  raise Exception(f"没有找到完整的语句")


def dist_node_type_struct(token_list:list[lex_token]):
  """Build a node from a ``struct`` statement.

  ``struct NAME`` (exactly two tokens) yields a ``node_struct_decl``.
  ``struct NAME { ... }`` yields a ``node_struct_def`` whose members are parsed
  statement by statement with ``find_sentence`` / ``dist_node_type``. The
  definition node is constructed with a ``body=`` keyword, so the node class
  must declare a ``body`` field.

  Raises ``Exception`` for any other token layout or a missing closing ``}``.
  """
  head_ok=(token_list[0].token==lex_c.TOKEN_STRUCT
           and token_list[1].token==lex_c.TOKEN_SYMBOL)
  if head_ok and len(token_list)==2:
    return node_struct_decl(name=token_list[1].buff.decode("utf-8"),token_list=token_list)
  if not head_ok or token_list[2].token!=lex_c.TOKEN("{"):
    raise Exception(f"语法错误 {token_list[0]}")
  if token_list[-1].token!=lex_c.TOKEN("}"):
    raise Exception("没有出现预期的符号 '}'")
  members:list[node_variable_def]=[]
  # Tokens strictly between the braces.
  remaining=token_list[3:-1]
  while remaining:
    sentence=find_sentence(remaining)
    members.append(dist_node_type(token_list=sentence))
    remaining=remaining[len(sentence):]
  return node_struct_def(name=token_list[1].buff.decode("utf-8"),token_list=token_list,body=members)


def dist_node_type_union(token_list:list[lex_token]):
  """Build a node from a ``union`` statement.

  ``union NAME`` (exactly two tokens) yields a ``node_union_decl``.
  ``union NAME { ... }`` yields a ``node_union_def`` whose members are parsed
  statement by statement with ``find_sentence`` / ``dist_node_type``. The
  definition node is constructed with a ``body=`` keyword, so the node class
  must declare a ``body`` field.

  Raises ``Exception`` for any other token layout or a missing closing ``}``.
  """
  head_ok=(token_list[0].token==lex_c.TOKEN_UNION
           and token_list[1].token==lex_c.TOKEN_SYMBOL)
  if head_ok and len(token_list)==2:
    return node_union_decl(name=token_list[1].buff.decode("utf-8"),token_list=token_list)
  if not head_ok or token_list[2].token!=lex_c.TOKEN("{"):
    raise Exception(f"语法错误 {token_list[0]}")
  if token_list[-1].token!=lex_c.TOKEN("}"):
    raise Exception("没有出现预期的符号 '}'")
  members:list[node_variable_def]=[]
  # Tokens strictly between the braces.
  remaining=token_list[3:-1]
  while remaining:
    sentence=find_sentence(remaining)
    members.append(dist_node_type(token_list=sentence))
    remaining=remaining[len(sentence):]
  return node_union_def(name=token_list[1].buff.decode("utf-8"),token_list=token_list,body=members)


def dist_node_type_enum(token_list:list[lex_token]):
  """Build a node from an ``enum`` statement.

  ``enum NAME`` (exactly two tokens) yields a ``node_enum_decl``.
  ``enum NAME { A, B = 3, C }`` yields a ``node_enum_def`` whose body is a list
  of single-entry dicts ``{enumerator_name: value}`` in source order. The
  definition node is constructed with a ``body=`` keyword, so the node class
  must declare a ``body`` field.

  Values follow C rules: an enumerator without ``= NUM`` takes the previous
  enumerator's value plus one, and the first one defaults to 0. Only a single
  numeric token readable by ``int()`` is accepted as an explicit value.

  Raises ``Exception`` on an unexpected layout, a missing closing ``}``, or a
  missing ``,`` between enumerators.
  """
  if(len(token_list)>=2 and token_list[0].token==lex_c.TOKEN_ENUM
     and token_list[1].token==lex_c.TOKEN_SYMBOL):
    if(len(token_list)==2):
      return node_enum_decl(name=token_list[1].buff.decode("utf-8"),token_list=token_list)
    elif(token_list[2].token==lex_c.TOKEN("{")):
      if not token_list[-1].token==lex_c.TOKEN("}"):
        raise Exception("没有出现预期的符号 '}'")
      # Tokens strictly between the braces.
      token_list_local=token_list[3:-1]
      # Value the next enumerator receives when it has no explicit "= NUM".
      next_value=0
      v_list:list[dict]=[]
      while len(token_list_local)>0:
        if(token_list_local[0].token==lex_c.TOKEN_SYMBOL):
          key=token_list_local[0].buff.decode("utf-8")
          # Check the length first: the last enumerator may be the final token,
          # in which case there is nothing to look ahead at.
          has_value=(len(token_list_local)>=3
                     and token_list_local[1].token==lex_c.TOKEN("=")
                     and token_list_local[2].token==lex_c.TOKEN_NUM)
          if(has_value):
            value=int(token_list_local[2].buff.decode("utf-8"))
            token_list_local=token_list_local[3:]
          else:
            value=next_value
            token_list_local=token_list_local[1:]
          v_list.append({key:value})
          next_value=value+1
        if(len(token_list_local)>0):
          if(token_list_local[0].token!=lex_c.TOKEN(",")):
            raise Exception(f"枚举类型应该使用 ',' 分隔符")
          token_list_local=token_list_local[1:]
      return node_enum_def(name=token_list[1].buff.decode("utf-8"),token_list=token_list,body=v_list)
  if(len(token_list)==0):
    raise Exception("语法错误")
  raise Exception(f"语法错误 {token_list[0]}")


def dist_node_type_typedef(token_list:list[lex_token]):
  """Build a ``node_typedef`` from a ``typedef`` statement.

  The last token must be a symbol; it becomes the new type's name. The tokens
  between ``typedef`` and that name are read as: optional ``unsigned`` /
  ``const`` qualifiers, then either a ``struct``/``union`` with a tag
  (optionally with a ``{ ... }`` body, parsed through ``dist_node_type``), a
  previously defined type name, or a run of built-in type keywords and ``*``.

  Returns the typedef node. On the struct/union path the parsed definition (if
  any) is returned inside ``child``; on the other paths the collected
  specifier names are passed as ``attr`` with ``body=None``.

  Raises ``Exception`` when no new type name is present or an unexpected token
  is left over.

  NOTE(review): ``token.name`` is read for keyword tokens; this assumes
  ``lex_token`` exposes a ``name`` attribute — confirm against ``lex_c``.
  """
  if(token_list[0].token==lex_c.TOKEN_TYPEDEF):
    attr=[]
    # Keep the complete statement for the node; ``token_list`` is consumed below.
    token_list_local=token_list
    if(token_list[-1].token!=lex_c.TOKEN_SYMBOL):
      raise Exception(f"没有定义新类型 {token_list[-1]}")
    name=token_list[-1].buff.decode("utf-8")
    # Drop the leading ``typedef`` keyword.
    token_list=token_list[1:]
    while token_list[0].token in [lex_c.TOKEN_UNSIGNED,lex_c.TOKEN_CONST]:
      attr.append(token_list[0].name)
      token_list=token_list[1:]
    if(token_list[0].token==lex_c.TOKEN_STRUCT or token_list[0].token==lex_c.TOKEN_UNION):
      attr.append(token_list[0].name)
      if(token_list[1].token==lex_c.TOKEN_SYMBOL):
        if(len(token_list)<3):
          # Only ``struct TAG`` remains: the trailing symbol is the tag itself,
          # so no new type name was given.
          raise Exception(f"没有定义新类型 {token_list[-1]}")
        children=[]
        attr.append(token_list[1].buff.decode("utf-8"))
        if(token_list[2].token==lex_c.TOKEN("{")):
          # ``typedef`` was already stripped, so the definition is everything
          # except the trailing new-type name. The struct/union keyword must be
          # kept or the dispatcher cannot recognise the statement.
          children.append(dist_node_type(token_list=token_list[:-1]))
        elif(token_list[2].token==lex_c.TOKEN("*")):
          attr.append(token_list[2].name)
        # NOTE(review): ``attr`` collected on this path is not forwarded to the
        # node (unchanged from the previous behaviour).
        return node_typedef(name=name,token_list=token_list_local,child=children)
    if(token_list[0].token==lex_c.TOKEN_SYMBOL):
      # A custom type previously introduced with typedef
      attr.append(token_list[0].buff.decode("utf-8"))
      token_list=token_list[1:]
    else:
      # Built-in C types
      while(token_list[0].token in
        [lex_c.TOKEN_INT,lex_c.TOKEN_CHAR,lex_c.TOKEN_SHORT,lex_c.TOKEN_LONG,lex_c.TOKEN_FLOAT,
         lex_c.TOKEN_DOUBLE,lex_c.TOKEN_VOID,lex_c.TOKEN("*")]):
        attr.append(token_list[0].name)
        token_list=token_list[1:]
    # Only the new type name may remain at this point.
    if(len(token_list)>1):
      raise Exception(f"意外的token {token_list[0]}")
    return node_typedef(name=name,token_list=token_list_local,attr=attr,body=None)
  raise Exception(f"语法错误 {token_list[0]}")


# Decide what kind of statement a token list is and parse it
def dist_node_type(token_list:list[lex_token]):
  """Dispatch one statement to the parser matching its leading keyword.

  A leading ``extern`` and a trailing ``;`` are removed first. The statement is
  then handed to the struct, union, enum or typedef parser according to its
  first remaining token, and that parser's node is returned.

  Raises ``Exception`` when the first token is none of those keywords.
  """
  if token_list[0].token==lex_c.TOKEN_EXTERN:
    token_list=token_list[1:]
  if token_list[-1].token==lex_c.TOKEN(";"):
    token_list=token_list[:-1]
  leading=token_list[0].token
  # Checked in order, same as a chain of ifs.
  handlers=(
    (lex_c.TOKEN_STRUCT,dist_node_type_struct),
    (lex_c.TOKEN_UNION,dist_node_type_union),
    (lex_c.TOKEN_ENUM,dist_node_type_enum),
    (lex_c.TOKEN_TYPEDEF,dist_node_type_typedef),
  )
  for keyword,handler in handlers:
    if leading==keyword:
      return handler(token_list=token_list)

  raise Exception(f"无法处理的token类型 {token_list[0]}")


if __name__ == "__main__":
  # Script entry: lex a C source file, split the token stream into statements,
  # parse each one into a node under a file node, and print each statement's
  # tokens.
  # The input path is the first command-line argument when given; otherwise the
  # previous fixed default "main.c" is used.
  file_name=sys.argv[1] if len(sys.argv)>1 else "main.c"
  with open(file_name,mode='rb') as f:
    token_list=lex(f.read())
  file=node_file(name=file_name,token_list=token_list,child=[])
  while len(token_list)>0:
    sentence=find_sentence(token_list)
    node_d=dist_node_type(sentence)
    file.child.append(node_d)
    print('找到一个语句：')
    for item in sentence:
      print(f"\t{item}")
    # Continue after the statement just consumed.
    token_list=token_list[len(sentence):]