// c_soft/soft/clexical.c

#include "stdio.h"
#include "regex.h"
#include "stdlib.h"
#include "stdint.h"
#include "exception.h"
#include "mythread.h"
#include "unistd.h"
#include "debug.h"
#include "string.h"
#include "ctype.h"
#include "mystdlib.h"


// c语言词法分析

// Token codes for everything that is not a single-character token.
// Named codes start at 256; lex_analysis stores the raw character value as
// the code of single-character tokens.
typedef enum {
    // keywords
    TOKEN_IF       = 256,
    TOKEN_BREAK    = 257,
    TOKEN_WHILE    = 258,
    TOKEN_SWITCH   = 259,
    TOKEN_CASE     = 260,
    TOKEN_DO       = 261,
    TOKEN_CHAR     = 262,
    TOKEN_INT      = 263,
    TOKEN_VOID     = 264,
    TOKEN_ELSE     = 275,
    TOKEN_CONTINUE = 276,
    TOKEN_CONST    = 277,
    TOKEN_STATIC   = 278,
    TOKEN_UNSIGNED = 279,
    TOKEN_TYPEDEF  = 280,
    TOKEN_STRUCT   = 281,
    TOKEN_ENUM     = 282,
    TOKEN_UNION    = 283,

    // identifiers and literals
    TOKEN_NAME     = 265,
    TOKEN_NUM      = 266,  // number

    // multi-character operators
    TOKEN_INC      = 267,  // increment
    TOKEN_DEC      = 268,  // decrement
    TOKEN_EQ       = 269,  // equal
    TOKEN_NEQ      = 270,  // not equal
    TOKEN_LSH      = 271,  // left shift
    TOKEN_RSH      = 272,  // right shift
    TOKEN_LEQ      = 273,  // less than or equal
    TOKEN_GEQ      = 274,  // greater than or equal
} token_index_def;


// One row of the keyword table.
typedef struct {
    // Keyword spelling; lex_compare_keywords matches it with strcmp.
    const char* key;
    // Token code returned when the keyword matches.
    const token_index_def token;
    // Name of the token code as text (TOKEN_DEF stringifies it).
    const char* token_str;
}keywork_item_def;

#define TOKEN_DEF(s,t)  {#s,t,#t}

const keywork_item_def g_keyword_table[ ] = {
    TOKEN_DEF(if,TOKEN_IF),
    TOKEN_DEF(else,TOKEN_ELSE),
    TOKEN_DEF(break,TOKEN_IF),
    TOKEN_DEF(while,TOKEN_WHILE),
    TOKEN_DEF(switch,TOKEN_SWITCH),
    TOKEN_DEF(case,TOKEN_CASE),
    TOKEN_DEF(do,TOKEN_DO),
    TOKEN_DEF(char,TOKEN_CHAR),
    TOKEN_DEF(int,TOKEN_INT),
    TOKEN_DEF(void,TOKEN_VOID),
    TOKEN_DEF(continue,TOKEN_CONTINUE),
    TOKEN_DEF(const,TOKEN_CONST),
    TOKEN_DEF(static,TOKEN_STATIC),
    TOKEN_DEF(unsigned,TOKEN_UNSIGNED),
    TOKEN_DEF(typedef,TOKEN_TYPEDEF),
    TOKEN_DEF(struct,TOKEN_STRUCT),
    TOKEN_DEF(enum,TOKEN_ENUM),
    TOKEN_DEF(union,TOKEN_UNION),
    {NULL},
};


// Size in bytes of the fixed character buffers in this file that hold token
// text and names (token_def.buff, describe_var_def.der_name, par_def.name).
#define TOKEN_BUFF_MAX_LEN      128

// Character-class helpers for the lexer.
// Every argument is converted to unsigned char before it reaches <ctype.h>:
// those functions are only defined for values representable as unsigned char
// (or EOF), so a negative plain-char byte (>= 0x80) must not be passed as is.
// The argument is therefore always treated as a byte.

// letter or underscore
#define cislalpha(c)	(isalpha((unsigned char)(c)) || (c) == '_')
// digit, letter or underscore
#define cislalnum(c)	(isalnum((unsigned char)(c)) || (c) == '_')
// decimal digit
#define cisdigit(c)	(isdigit((unsigned char)(c)))
// whitespace
#define cisspace(c)	(isspace((unsigned char)(c)))
// printable character
#define cisprint(c)	(isprint((unsigned char)(c)))
// hexadecimal digit
#define cisxdigit(c)	(isxdigit((unsigned char)(c)))
// convert to lower case
#define ctolower(c)	(tolower((unsigned char)(c)))


// One token: its text plus where it started and what kind it is.
typedef struct{
    // Token text; lex_save_char appends one character at a time.
    char buff[TOKEN_BUFF_MAX_LEN];
    // Number of characters stored in buff.
    int used;
    // Line number recorded when the first character was saved.
    int line;
    // Position within the line recorded when the first character was saved.
    int pos;
    // Token code: a token_index_def value, or the character itself for
    // single-character tokens.
    int token;
}token_def;


// Node of the singly linked token list.
typedef struct _token_list {
    // The token stored in this node (held by value).
    token_def token;
    // Next node, or NULL for the last node.
    struct _token_list* next;
}token_list_node_def;


// Singly linked list of tokens with a tail pointer for appending.
typedef struct {
    // First node, or NULL when the list is empty.
    token_list_node_def* head;
    // Most recently appended node (the tail).
    token_list_node_def* current;
    // Number of nodes in the list.
    int len;
}token_list_def;


// Lexer state.
typedef struct {
    // Character most recently read by lex_get_next; 0 once input is exhausted.
    int current_c;
    // Current line counter (incremented by lex_analysis at each line break).
    int current_line;
    // Position counter within the current line.
    int current_line_pos;
    // Token currently being assembled.
    token_def token_buff;
    // Input text being scanned.
    const char *input_text;
    // Length of input_text in characters.
    int input_len;
    // Index of the next character to read from input_text.
    int input_pos;
    // Tokens saved so far.
    token_list_def tlist;
}lex_def;


// Look a word up in the keyword table.
// key: NUL-terminated string to compare against every keyword spelling.
// Returns the token code of the matching keyword, or -1 when there is none.
int lex_compare_keywords(const char* key) {
    // The table ends with a row whose key is NULL.
    for (const keywork_item_def* entry = g_keyword_table; entry->key != NULL; entry++) {
        if (strcmp(entry->key , key) != 0) {
            continue;
        }
        return entry->token;
    }
    return -1;
}


// Read the next input character into lex->current_c.
// On success advances input_pos and current_line_pos and returns 0.
// When the input is exhausted, sets current_c to 0 and returns -1.
int lex_get_next(lex_def *lex)
{
    if (lex->input_pos >= lex->input_len) {
        lex->current_c = 0;
        return -1;
    }
    // Read the byte as unsigned char so bytes >= 0x80 never become negative
    // ints: current_c is later handed to <ctype.h> classifiers, which are only
    // defined for values representable as unsigned char (or EOF).
    lex->current_c = (unsigned char)lex->input_text[lex->input_pos];
    lex->input_pos++;
    lex->current_line_pos++;
    return 0;
}


// Append the current character (lex->current_c) to the token being assembled.
// The line and position of the token are recorded when its first character
// is stored.
// Returns 0 on success, -1 when the token buffer has no room left.
int lex_save_char(lex_def *lex){
    token_def *t=&lex->token_buff;
    // Keep the last byte free. The buffer is zero-filled whenever it is empty
    // (lex_analysis zero-initializes it and lex_save_token clears it), so the
    // reserved byte is the NUL terminator that strcmp and "%s" rely on.
    if(t->used>=TOKEN_BUFF_MAX_LEN-1){
        return -1;
    }
    if (t->used == 0) {
        t->pos = lex->current_line_pos;
        t->line=lex->current_line;
    }
    t->buff[t->used]=(char)lex->current_c;
    t->used++;
    return 0;
}


// Append a copy of the pending token (lex->token_buff) to the token list and
// clear the pending token so the next one starts from an empty buffer.
// Returns 0 on success. Returns -1 without changing any state when the node
// cannot be allocated (assumes mem_calloc reports failure by returning NULL —
// TODO confirm against its definition).
int lex_save_token(lex_def* lex) {
    token_list_def* l = &lex->tlist;
    token_list_node_def* node = mem_calloc(1 , sizeof(token_list_node_def));
    if (node == NULL) {
        return -1;
    }
    memcpy(&node->token , &lex->token_buff , sizeof(token_def));
    memset(&lex->token_buff , 0 , sizeof(token_def));
    if (l->head == NULL) {
        l->head = node;
    } else {
        // Linking through the tail is sufficient: when the tail is the head,
        // this assignment is what sets head->next.
        l->current->next = node;
    }
    l->current = node;
    l->len++;
    return 0;
}


// Free every node of the token list and leave the list empty.
// Always returns 0.
int lex_del_token_list(lex_def* lex) {
    token_list_def* list = &lex->tlist;
    // Pop nodes from the front, one per counted element.
    for (; list->len > 0; list->len--) {
        token_list_node_def* following = list->head->next;
        mem_free(list->head);
        list->head = following;
    }
    list->current = NULL;
    list->head = NULL;
    return 0;
}

// Copy the token list into a newly allocated array.
// len: optional output; receives the number of tokens in the array.
// Returns the array, which the caller releases with mem_free. When the array
// cannot be allocated, returns NULL and reports a length of 0 (assumes
// mem_calloc reports failure by returning NULL — TODO confirm).
token_def* lex_cpy_token_list(lex_def* lex,int *len) {
    token_list_def* l = &lex->tlist;
    token_def* r = mem_calloc(l->len , sizeof(token_def));
    if (r == NULL) {
        if (len) *len = 0;
        return NULL;
    }
    if (len) *len = l->len;
    token_list_node_def* t = l->head;
    // Stop early if the chain is shorter than the recorded length; the
    // remaining array entries stay zero-filled.
    for (int i = 0; i < l->len && t != NULL; i++) {
        r[i] = t->token;
        t = t->next;
    }
    return r;
}


// Print every token in the list to stdout, one per line:
// line number, [position, text length], token code and the token text.
// Always returns 0.
int lex_print_token_list(lex_def* lex) {
    const token_list_node_def* node;
    for (node = lex->tlist.head; node != NULL; node = node->next) {
        const token_def* tk = &node->token;
        printf("%4d[%3d,%3d],token=%4d   \"%s\"\n" ,
            tk->line , tk->pos , tk->used , tk->token , tk->buff);
    }
    return 0;
}

int par_parser(token_def* token_list , int len);
int lex_analysis(const char *text,int len){
    lex_def lex = { 0 };
    int in_loop = 1;
    lex.input_text = text;
    lex.input_len = len;
    lex_get_next(&lex);
    lex.current_line_pos = 0;
    while (in_loop) {
        int _char = lex.current_c;
        switch (_char)
        {
        case 0: {
            in_loop = 0;
            break;
        }
        case '\r': {
            lex_get_next(&lex);
        }
        case '\n': {
            if(lex.current_c=='\n'){
                lex_get_next(&lex);
            }
            lex.current_line++;
            lex.current_line_pos=0;
            break;
        }
        case ' ':
        case '\t':
        case '\v':
        case '\f': {
            lex_get_next(&lex);
            break;
        }
        case '(':
        case ')':
        case '{':
        case '}':
        case '[':
        case ']':
        case '~':
        case ',':
        case ';':
        case ':':
        {
            lex_save_char(&lex);
            lex.token_buff.token = _char;
            lex_save_token(&lex);
            lex_get_next(&lex);
            break;
        }
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':{
            if (cisdigit(lex.current_c)) {
                do {
                    lex_save_char(&lex);
                    lex_get_next(&lex);
                } while (cisdigit(lex.current_c));
                lex.token_buff.token = TOKEN_NUM;
                lex_save_token(&lex);
            }
            break;
        }
        case '+':{
            lex_save_char(&lex);
            lex_get_next(&lex);
            if(lex.current_c=='+'){
                lex_save_char(&lex);
                lex_get_next(&lex);
                lex.token_buff.token = TOKEN_INC;
            }else{
                lex.token_buff.token = _char;
            }
            lex_save_token(&lex);
            break;
        }
        case '-':{
            lex_save_char(&lex);
            lex_get_next(&lex);
            if(lex.current_c=='-'){
                lex_save_char(&lex);
                lex_get_next(&lex);
                lex.token_buff.token = TOKEN_DEC;
            }else{
                lex.token_buff.token = _char;
            }
            lex_save_token(&lex);
            break;
        }
        case '!':
        case '>':
        case '<':
        case '=': {
            lex_save_char(&lex);
            lex_get_next(&lex);
            if(lex.current_c=='='){
                lex_save_char(&lex);
                lex_get_next(&lex);
                switch (_char)
                {
                case '!':lex.token_buff.token = TOKEN_NEQ;break;
                case '>':lex.token_buff.token = TOKEN_GEQ;break;
                case '<':lex.token_buff.token = TOKEN_LEQ;break;
                case '=':lex.token_buff.token = TOKEN_EQ;break;
                default:
                DBG_ERR("未知的情况('%c'): 在 %d 行" , lex.current_c , lex.current_line);
                    break;
                }
            }else{
                lex.token_buff.token = _char;
            }
            lex_save_token(&lex);
            break;
        }
        default:
            if (cislalpha(lex.current_c)) {
                do {
                    lex_save_char(&lex);
                    lex_get_next(&lex);
                } while (cislalnum(lex.current_c));
                int token = lex_compare_keywords(lex.token_buff.buff);
                if (token != -1) {
                    lex.token_buff.token = token;
                } else {
                    lex.token_buff.token = TOKEN_NAME;
                }
                lex_save_token(&lex);
            } else {
                DBG_ERR("未知的符号('%c'): 在 %d 行" , lex.current_c , lex.current_line);
                in_loop = 0;
            }
            break;
        }
    }
    lex_print_token_list(&lex);
    int token_len=0;
    token_def *tlist=lex_cpy_token_list(&lex , &token_len);
    lex_del_token_list(&lex);
    // par_parser(tlist , token_len);
    mem_free(tlist);
    DBG_LOG("解析结束.");
    return 0;
}


// c语言语义分析

// Variable types (numbered consecutively from 0).
typedef enum {
    VTYPE_VOID = 0,
    VTYPE_CHAR,
    VTYPE_SHORT,
    VTYPE_INT,
    VTYPE_PTR,
    VTYPE_FUNC,
} variable_type_def;

// Statement types (numbered consecutively from 0).
typedef enum {
    STYPE_NONE = 0,   // code file
    STYPE_TYPEDEF,    // type definition (typedef)
    STYPE_VDECLARE,   // variable declaration
    STYPE_FDECLARE,   // function declaration
    STYPE_STRUCT,     // struct statement
    STYPE_ENUM,       // enum statement
    STYPE_UNION,      // union statement
    STYPE_FDEF,       // function definition
    STYPE_PARLIST,    // parameter list
    STYPE_BLOCK,      // code block
} sentence_type_def;

// Attribute flags; each one is a distinct bit so they can be OR-ed together.
typedef enum {
    ATYPE_VOID     = 0,        // no attribute
    ATYPE_STATIC   = 1 << 0,   // static
    ATYPE_CONST    = 1 << 1,   // read-only
    ATYPE_EXTERN   = 1 << 2,   // external
    ATYPE_UNSIGNED = 1 << 3,   // unsigned
} attr_def;


// Token categories (numbered consecutively from 0).
typedef enum {
    TTYPE_NONE = 0,
    TTYPE_ATTR,   // attribute declaration keyword
    TTYPE_TYPE,   // type declaration keyword
    TTYPE_PROC,   // flow control keyword
    TTYPE_OP,     // operation keyword
} token_type_def;


// Description of a variable
typedef struct _describe_var {
    int attribute;// attributes: static, read-only, unsigned, etc.
    int type;// type
    char der_name[TOKEN_BUFF_MAX_LEN];// name of the derived type: struct, union, etc.
    void* value;// value of the variable
}describe_var_def;

// Description of a function type
typedef struct _describe_fun {
    int attribute;// function type
    int type;// return value type
    int par_num;// number of parameters
    describe_var_def* par_list;// parameter list
}describe_fun_def;

// Description of a statement's subject: either a variable or a function.
// Both members begin with the same two int fields (attribute, type).
typedef union _describe_union
{
    describe_var_def v;
    describe_fun_def f;
}describe_def;

// Access field `a` of union member `v` through the description pointer `des`.
// The result is the member itself, so it can be read or assigned.
#define des_get_value(des, v, a) ((des)->v.a)


// One parsed statement. Statements are linked to the following statement
// through `next` and to their sub-statements through `child`.
typedef struct _par_def
{
    int attribute;// attributes: static, read-only, unsigned, etc.
    int type;// type: declaration, definition, variable, function, etc.
    char name [TOKEN_BUFF_MAX_LEN];// variable name
    describe_def des;// describes the content of the statement
    token_def* token_list; // start of the tokens contained in this statement
    int token_list_len;// length of this statement's token list
    int child_num;// number of sub-statements
    struct _par_def* child;// sub-statements
    struct _par_def* next;// next statement
}par_def;


// Find the token that closes the bracket opened by the first token.
// t_list / len: tokens to search; t_list[0] is expected to be token_s.
// token_s / token_e: opening and closing token codes.
// Returns the number of tokens from the opening token up to and including
// its matching closing token, or 0 when len is less than 1.
// Reports through throw_ when the list does not start with token_s or when
// no matching closing token is found.
int par_find_closed(token_def* t_list , int len , int token_s , int token_e) {
    if (len < 1) {
        return 0;
    }
    if (t_list[0].token != token_s) {
        throw_("token_list 不以 \"%c\" 开头，在 %d 行，%d 位置" , token_s , t_list[0].line , t_list[0].pos);
    }
    int depth = 0;   // currently open, not yet closed brackets
    int idx = 0;
    while (idx < len) {
        const int code = t_list[idx].token;
        if (code == token_s) {
            depth++;
        } else if (code == token_e) {
            depth--;
        }
        idx++;
        if (depth == 0) {
            return idx;
        }
    }
    throw_("缺少成对的符号，第一个符号 \"%c\" 在 %d 行，%d 位置" , token_s , t_list[0].line , t_list[0].pos);
    return -1;
}

// Parse a formal parameter list.
// Placeholder: no parsing is implemented; the function always returns 0.
int par_var_def( ) {
    const int result = 0;
    return result;
}


// Classify a variable declaration, function declaration or function
// definition.
// Called when a TOKEN_NAME is reached, so t_list[0] is the name token.
// par: statement node that receives the classification in par->type.
// t_list / len: tokens starting at the name.
// Reports through throw_ when required following tokens are missing or an
// unexpected token follows the parameter list.
// NOTE(review): the original comment says this returns the number of consumed
// tokens, but every path returns 0 — confirm the intended contract with the
// caller before changing it.
int par_var_fun_def(par_def* par , token_def* t_list , int len) {
    if (len < 2) {
        throw_("缺少后续token，在 %d 行，%d 位置" , t_list [0].line , t_list [0].pos);
    }
    if (t_list [1].token == ';') {
        // variable declaration
        par->type = STYPE_VDECLARE;
        par->token_list_len = 0;
        if (par->token_list) {
            throw_("出现意外的token_list，在 %d 行，%d 位置" , t_list [0].line , t_list [0].pos);
        }
    } else if (t_list [1].token == '(') {
        // may be a function call, a function definition or a function declaration
        t_list++;len--;
        // t_list[0] is now '('; close_len counts the tokens up to and
        // including the matching ')'.
        int close_len = par_find_closed(t_list , len , '(' , ')');
        if (close_len >= len ) {
            throw_("缺少后续token，在 %d 行，%d 位置" , t_list [close_len-1 ].line , t_list [close_len-1 ].pos);
        }
        // The token right after ')' is at index close_len; index
        // close_len + 1 would skip it and can lie past the end of the list.
        if (t_list [close_len].token == '{') {
            // function definition
            par->type = STYPE_FDEF;
        } else if (t_list [close_len].token == ';') {
            // function declaration
            par->type = STYPE_FDECLARE;
        } else {
            throw_("意外的token，在 %d 行，%d 位置" , t_list [close_len].line , t_list [close_len].pos);
        }
    }
    return 0;
}


// Scan the token range of p (p->token_list, p->token_list_len) and build the
// list of child statements under p->child.
// Type keywords set the pending variable type, attribute keywords set flags
// on the current child node, and every TOKEN_NAME finishes the current child
// and appends a fresh one (so the list ends with a node that has no name).
// p->child_num is incremented for every node that is allocated.
// Always returns 0.
int par_statement(par_def* p) {
    token_def* tokens = p->token_list;
    par_def* node = mem_calloc(1 , sizeof(par_def));
    int var_type = -1;   // -1 until a type keyword has been seen
    p->child = node;
    p->child_num++;
    int i = 0;
    while (i < p->token_list_len) {
        const int code = tokens[i].token;
        if (code == TOKEN_CHAR) {
            var_type = VTYPE_CHAR;
        } else if (code == TOKEN_INT) {
            var_type = VTYPE_INT;
        } else if (code == TOKEN_VOID) {
            var_type = VTYPE_VOID;
        } else if (code == TOKEN_CONST) {
            node->attribute |= ATYPE_CONST;
        } else if (code == TOKEN_STATIC) {
            node->attribute |= ATYPE_STATIC;
        } else if (code == TOKEN_UNSIGNED) {
            node->attribute |= ATYPE_UNSIGNED;
        } else if (code == TOKEN_NAME) {
            memcpy(node->name , tokens[i].buff , tokens[i].used);
            if (var_type >= 0) {
                // A type was seen: variable declaration, function declaration
                // or function definition.
                i += par_var_fun_def(node , &tokens[i] , p->token_list_len - i);
            }
            // Otherwise: function call, call through a pointer or assignment
            // (not handled here).

            // This statement is finished; continue with the next one.
            node->next = mem_calloc(1 , sizeof(par_def));
            node = node->next;
            p->child_num++;
        }
        i++;
    }
    return 0;
}


// Parse a token array.
// token_list / len: the tokens and their count; the array stays owned by the
// caller and is not freed here.
// Builds a root statement node covering the whole array, runs par_statement
// on it, then releases the root and the child nodes par_statement attached,
// since nothing else keeps a reference to them after this function returns.
// Returns the result of par_statement, or -1 when the root node cannot be
// allocated (assumes mem_calloc reports failure by returning NULL — TODO
// confirm).
int par_parser(token_def* token_list , int len) {
    par_def* root = mem_calloc(1 , sizeof(par_def));
    if (root == NULL) {
        return -1;
    }
    // par_def.type holds a statement type, so use the STYPE_ constant
    // (same numeric value as TTYPE_NONE).
    root->type = STYPE_NONE;
    root->attribute = 0;
    root->token_list = token_list;
    root->token_list_len = len;
    int ret = par_statement(root);
    // Free the chain of child statements, then the root itself.
    par_def* node = root->child;
    while (node != NULL) {
        par_def* following = node->next;
        mem_free(node);
        node = following;
    }
    mem_free(root);
    return ret;
}