Files
c_soft/soft/clexical.c
2024-10-30 23:30:35 +08:00

356 lines
7.9 KiB
C

#include "stdio.h"
#include "regex.h"
#include "stdlib.h"
#include "stdint.h"
#include "exception.h"
#include "mythread.h"
#include "unistd.h"
#include "debug.h"
#include "string.h"
#include "ctype.h"
// C lexical analyzer.
// Token ids for keywords and multi-character operators. Values start at
// 256 so that single-character tokens can use their own character code
// directly as the token id without colliding.
typedef enum {
TOKEN_IF = 256,
TOKEN_BREAK = 257,
TOKEN_WHILE=258,
TOKEN_SWITCH=259,
TOKEN_CASE=260,
TOKEN_DO=261,
TOKEN_CHAR=262,
TOKEN_INT=263,
TOKEN_VOID=264,
TOKEN_NAME = 265 ,// identifier
TOKEN_NUM = 266 ,// numeric literal
TOKEN_INC = 267,// increment "++"
TOKEN_DEC = 268,// decrement "--"
TOKEN_EQ = 269,// equal "=="
TOKEN_NEQ = 270,// not equal "!="
TOKEN_LSH = 271,// left shift "<<"
TOKEN_RSH = 272,// right shift ">>"
TOKEN_LEQ = 273,// less or equal "<="
TOKEN_GEQ = 274,// greater or equal ">="
TOKEN_ELSE = 275,
TOKEN_CONTINUE = 276,
}token_index_def;
// One keyword-table entry: the keyword text, its token id, and the
// token id's name as a string (for debugging output).
typedef struct {
const char* key;
const token_index_def token;
const char* token_str;
}keywork_item_def;
#define TOKEN_DEF(s,t) {#s,t,#t}
const keywork_item_def g_keyword_table[ ] = {
TOKEN_DEF(if,TOKEN_IF),
TOKEN_DEF(else,TOKEN_ELSE),
TOKEN_DEF(break,TOKEN_IF),
TOKEN_DEF(while,TOKEN_WHILE),
TOKEN_DEF(switch,TOKEN_SWITCH),
TOKEN_DEF(case,TOKEN_CASE),
TOKEN_DEF(do,TOKEN_DO),
TOKEN_DEF(char,TOKEN_CHAR),
TOKEN_DEF(int,TOKEN_INT),
TOKEN_DEF(void,TOKEN_VOID),
TOKEN_DEF(continue,TOKEN_CONTINUE),
{NULL},
};
#define TOKEN_BUFF_MAX_LEN 128
// Character-class helpers. The <ctype.h> functions have undefined
// behavior when given a plain char with a negative value (non-ASCII
// bytes where char is signed), so the argument is cast to unsigned
// char first (CERT STR37-C).
// letter or underscore (identifier start)
#define cislalpha(c) (isalpha((unsigned char)(c)) || (c) == '_')
// digit, letter or underscore (identifier continuation)
#define cislalnum(c) (isalnum((unsigned char)(c)) || (c) == '_')
// decimal digit
#define cisdigit(c) (isdigit((unsigned char)(c)))
// whitespace
#define cisspace(c) (isspace((unsigned char)(c)))
// printable character
#define cisprint(c) (isprint((unsigned char)(c)))
// hexadecimal digit
#define cisxdigit(c) (isxdigit((unsigned char)(c)))
// convert to lower case
#define ctolower(c) (tolower((unsigned char)(c)))
// One lexed token.
typedef struct{
char buff[TOKEN_BUFF_MAX_LEN];// token text (buffer is zeroed between tokens)
int used;// number of characters stored in buff
int line;// 0-based source line of the token's first character
int pos;// column of the token's first character within its line
int token;// token id: a token_index_def value, or the character itself
}token_def;
// Singly-linked list node holding one token.
typedef struct _token_list {
token_def token;
struct _token_list* next;
}token_list_node_def;
// Token list accumulated by the lexer.
typedef struct {
token_list_node_def* head;// first node
token_list_node_def* current;// last appended node (list tail)
int len;// node count
}token_list_def;
// Full lexer state for one analysis run.
typedef struct {
int current_c;// lookahead character; 0 once input is exhausted
int current_line;// 0-based line of current_c
int current_line_pos;// position of current_c within the line
token_def token_buff;// token currently being accumulated
const char *input_text;// input buffer (not owned)
int input_len;// input length in bytes
int input_pos;// next read offset into input_text
token_list_def tlist;// tokens produced so far
}lex_def;
// Look up an identifier in the keyword table.
// Returns its token id, or -1 if it is not a reserved word.
int lex_compare_keywords(const char* key) {
    for (const keywork_item_def* item = g_keyword_table; item->key != NULL; item++) {
        if (strcmp(item->key, key) == 0) {
            return item->token;
        }
    }
    return -1;
}
// Advance the lexer by one character.
// Returns 0 on success; at end of input sets current_c to 0 and returns -1.
int lex_get_next(lex_def *lex)
{
    if (lex->input_pos < lex->input_len) {
        lex->current_c = lex->input_text[lex->input_pos++];
        lex->current_line_pos++;
        return 0;
    }
    lex->current_c = 0;
    return -1;
}
// Append the current character to the in-progress token buffer.
// Records the token's starting line/column on its first character.
// Returns 0 on success, -1 when the buffer is full.
int lex_save_char(lex_def *lex){
    token_def *t=&lex->token_buff;
    // fixed off-by-one: reserve one byte so buff is always NUL-terminated
    // (the buffer is zeroed between tokens), keeping it safe for
    // strcmp()/printf("%s"). Previously a 128-char token left no NUL.
    if(t->used >= TOKEN_BUFF_MAX_LEN - 1){
        return -1;
    }
    if (t->used == 0) {
        t->pos = lex->current_line_pos;
        t->line = lex->current_line;
    }
    t->buff[t->used] = lex->current_c;
    t->used++;
    return 0;
}
// Move the finished token out of token_buff into a new list node and
// append it to the token list. Clears token_buff for the next token.
// Returns 0 on success, -1 on allocation failure (token_buff untouched).
int lex_save_token(lex_def* lex) {
    token_list_node_def* t = mem_calloc(1 , sizeof(token_list_node_def));
    token_list_def* l = &lex->tlist;
    if (t == NULL) {
        // fixed: an allocation failure was previously dereferenced
        return -1;
    }
    memcpy(&t->token , &lex->token_buff , sizeof(token_def));
    memset(&lex->token_buff , 0 , sizeof(token_def));
    if (l->head == NULL) {
        l->head = t;
    } else {
        // current is always the tail, so appending the second node already
        // links head->next; the old extra head->next fixup was redundant.
        l->current->next = t;
    }
    l->current = t;
    l->len++;
    return 0;
}
// Free every node of the token list and reset it to empty. Returns 0.
int lex_del_token_list(lex_def* lex) {
    token_list_def* l = &lex->tlist;
    token_list_node_def* node = l->head;
    token_list_node_def* next;
    while (l->len > 0) {
        next = node->next;
        mem_free(node);
        node = next;
        l->len--;
    }
    l->head = NULL;
    l->current = NULL;
    return 0;
}
// Dump every token as: "text":line[startcol,endcol],token=id. Returns 0.
int lex_print_token_list(lex_def* lex) {
    for (token_list_node_def* node = lex->tlist.head; node != NULL; node = node->next) {
        const token_def* tk = &node->token;
        printf("\"%s\":%d[%d,%d],token=%d\n" , tk->buff , tk->line , tk->pos ,
               tk->pos + tk->used -1 , tk->token);
    }
    return 0;
}
int lex_analysis(const char *text,int len){
lex_def lex = { 0 };
int in_loop = 1;
lex.input_text = text;
lex.input_len = len;
lex_get_next(&lex);
lex.current_line_pos = 0;
while(in_loop){
switch (lex.current_c)
{
case 0: {
in_loop = 0;
break;
}
case '\r': {
lex_get_next(&lex);
}
case '\n': {
if(lex.current_c=='\n'){
lex_get_next(&lex);
}
lex.current_line++;
lex.current_line_pos=0;
break;
}
case ' ':
case '\t':
case '\v':
case '\f': {
lex_get_next(&lex);
break;
}
case '(':
case ')':
case '{':
case '}':
case '[':
case ']':
case '~':
case ',':
case ';':
case ':':
{
lex_save_char(&lex);
lex.token_buff.token = lex.current_c;
lex_save_token(&lex);
lex_get_next(&lex);
break;
}
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':{
if (cisdigit(lex.current_c)) {
do {
lex_save_char(&lex);
lex_get_next(&lex);
} while (cisdigit(lex.current_c));
lex.token_buff.token = TOKEN_NUM;
lex_save_token(&lex);
}
break;
}
case '+':{
lex_save_char(&lex);
lex_get_next(&lex);
if(lex.current_c=='+'){
lex_save_char(&lex);
lex_get_next(&lex);
lex.token_buff.token = TOKEN_INC;
}else{
lex.token_buff.token = lex.current_c;
}
lex_save_token(&lex);
break;
}
case '-':{
lex_save_char(&lex);
lex_get_next(&lex);
if(lex.current_c=='-'){
lex_save_char(&lex);
lex_get_next(&lex);
lex.token_buff.token = TOKEN_DEC;
}else{
lex.token_buff.token = lex.current_c;
}
lex_save_token(&lex);
break;
}
case '!':
case '>':
case '<':
case '=':{
lex_save_char(&lex);
lex_get_next(&lex);
if(lex.current_c=='='){
lex_save_char(&lex);
lex_get_next(&lex);
lex.token_buff.token = TOKEN_EQ;
}else{
lex.token_buff.token = lex.current_c;
}
lex_save_token(&lex);
break;
}
default:
if (cislalpha(lex.current_c)) {
do {
lex_save_char(&lex);
lex_get_next(&lex);
} while (cislalnum(lex.current_c));
int token = lex_compare_keywords(lex.token_buff.buff);
if (token != -1) {
lex.token_buff.token = token;
} else {
lex.token_buff.token = TOKEN_NAME;
}
lex_save_token(&lex);
} else {
DBG_ERR("未知的符号('%c'): 在 %d 行" , lex.current_c , lex.current_line);
in_loop = 0;
}
break;
}
}
lex_print_token_list(&lex);
lex_del_token_list(&lex);
return 0;
}