285 lines
6.9 KiB
C
285 lines
6.9 KiB
C
#include "char_encode.h"

#include <stddef.h>
#include <stdint.h>

#include "mymem.h"
#include "string.h"
#include "unigbk_table.h"
|
||
|
||
// Convert one UCS-2 code unit (big-endian) to UTF-8.
//
// Reads the two bytes at uni_in (high byte first), appends the UTF-8 encoding
// at *utf8_out and advances *utf8_out past the bytes written:
//   U+0001..U+007F -> 1 byte
//   U+0080..U+07FF -> 2 bytes
//   U+0800..U+FFFF -> 3 bytes
// Surrogate code units are not paired; each one is encoded on its own as a
// 3-byte sequence.
//
// Parameters:
//   uni_in   - address of the code unit to convert, high byte first.
//   utf8_out - address of the output cursor; advanced by the number of bytes
//              written. The caller must provide room for up to 3 bytes.
// Returns:
//   uni_in + 2 (the next code unit), or NULL when uni_in, utf8_out or
//   *utf8_out is NULL, or when the code unit is U+0000 (end of string).
//   Nothing is written when NULL is returned.
uint8_t *uni2utf8(const uint8_t *uni_in, uint8_t **utf8_out) {
  if (uni_in == NULL || utf8_out == NULL || *utf8_out == NULL) {
    return NULL;
  }

  const uint16_t c = (uint16_t)((uni_in[0] << 8) | uni_in[1]);
  if (c == 0) {
    return NULL;
  }

  uint8_t *out = *utf8_out;
  if (c < 0x80) {
    *out++ = (uint8_t)c;
  } else if (c < 0x800) {
    // Two-byte form; using three bytes here would be an overlong encoding.
    *out++ = (uint8_t)(0xC0 | (c >> 6));
    *out++ = (uint8_t)(0x80 | (c & 0x3F));
  } else {
    *out++ = (uint8_t)(0xE0 | (c >> 12));
    *out++ = (uint8_t)(0x80 | ((c >> 6) & 0x3F));
    *out++ = (uint8_t)(0x80 | (c & 0x3F));
  }
  *utf8_out = out;

  // The return type is part of the existing interface, hence the cast.
  return (uint8_t *)uni_in + 2;
}
|
||
|
||
// Convert a UCS-2 string (big-endian, terminated by a U+0000 code unit) to a
// NUL-terminated UTF-8 string.
//
// Parameters:
//   uni_in   - source string; a NULL pointer produces an empty result.
//   utf8_out - destination buffer, sized by the caller.
void uni2utf8_str(uint8_t *uni_in, uint8_t *utf8_out) {
  uint8_t *dst = utf8_out;

  // uni2utf8() hands back the next code unit, or NULL once it reaches the
  // terminator, which ends the loop.
  for (uint8_t *src = uni_in; src != NULL; src = uni2utf8(src, &dst)) {
  }

  *dst = 0;
}
|
||
|
||
// Convert one UTF-8 sequence to a UCS-2 code unit (big-endian).
//
// Decodes the sequence starting at uft8_in, stores the code unit at *uni_out
// (high byte first) and advances *uni_out by 2.
//   - 1-, 2- and 3-byte sequences are decoded to their code point. Overlong
//     forms are accepted rather than rejected.
//   - A well-formed 4-byte sequence (outside the 16-bit range) is consumed
//     whole and stored as U+FFFD.
//   - Any other byte (stray continuation byte, invalid lead byte, or a
//     sequence cut short) is consumed alone and stored as U+FFFD.
// A continuation byte is only read after the previous byte has been checked,
// so the decoder never reads or steps past the NUL terminator.
//
// Parameters:
//   uft8_in - address of the sequence to decode.
//   uni_out - address of the output cursor; advanced by 2.
// Returns:
//   address of the next UTF-8 sequence, or NULL when uft8_in, uni_out or
//   *uni_out is NULL, or when uft8_in points at the NUL terminator. Nothing is
//   written when NULL is returned.
uint8_t *utf82uni(const uint8_t *uft8_in, uint8_t **uni_out) {
  if (uft8_in == NULL || uni_out == NULL || *uni_out == NULL ||
      uft8_in[0] == 0) {
    return NULL;
  }

  const uint8_t lead = uft8_in[0];
  uint16_t code = 0xFFFD;  // replacement character unless decoding succeeds
  unsigned int used = 1;

  if (lead < 0x80) {
    code = lead;
  } else if (lead >= 0xC0 && lead <= 0xDF) {
    if ((uft8_in[1] & 0xC0) == 0x80) {
      code = (uint16_t)(((lead & 0x1F) << 6) | (uft8_in[1] & 0x3F));
      used = 2;
    }
  } else if (lead >= 0xE0 && lead <= 0xEF) {
    if ((uft8_in[1] & 0xC0) == 0x80 && (uft8_in[2] & 0xC0) == 0x80) {
      code = (uint16_t)(((lead & 0x0F) << 12) | ((uft8_in[1] & 0x3F) << 6) |
                        (uft8_in[2] & 0x3F));
      used = 3;
    }
  } else if (lead >= 0xF0 && lead <= 0xF7) {
    if ((uft8_in[1] & 0xC0) == 0x80 && (uft8_in[2] & 0xC0) == 0x80 &&
        (uft8_in[3] & 0xC0) == 0x80) {
      used = 4;  // does not fit in 16 bits; code stays U+FFFD
    }
  }

  (*uni_out)[0] = (uint8_t)(code >> 8);
  (*uni_out)[1] = (uint8_t)(code & 0xFF);
  *uni_out += 2;

  // The return type is part of the existing interface, hence the cast.
  return (uint8_t *)uft8_in + used;
}
|
||
|
||
// Convert a NUL-terminated UTF-8 string to a UCS-2 string (big-endian),
// terminated by two zero bytes.
//
// Parameters:
//   uft8_in - source string; a NULL pointer produces an empty result.
//   uni_out - destination buffer, sized by the caller.
void utf82uni_str(uint8_t *uft8_in, uint8_t *uni_out) {
  uint8_t *dst = uni_out;

  // utf82uni() hands back the next sequence, or NULL once it reaches the
  // terminator, which ends the loop.
  for (uint8_t *src = uft8_in; src != NULL; src = utf82uni(src, &dst)) {
  }

  // 16-bit terminator.
  dst[0] = 0;
  dst[1] = 0;
}
|
||
|
||
// Cached return value of unigbk_open(). Zero means the table has not been
// opened yet (or the open call returned 0); gbk2uni() and uni2gbk() retry the
// open while it is zero and otherwise use it as the table size in bytes.
static int g_unigbk_size = 0;
|
||
|
||
// GBK编码转Unicode编码
|
||
// 高字节在前
|
||
uint8_t *gbk2uni(uint8_t *gbk_in, uint8_t **uni_out) {
|
||
uint16_t t[2];
|
||
uint16_t c;
|
||
uint32_t i, li, hi;
|
||
uint16_t n;
|
||
unsigned int cout;
|
||
uint32_t gbk2uni_offset = 0;
|
||
|
||
if (gbk_in == 0 || *gbk_in == 0)
|
||
return 0;
|
||
if (*gbk_in < 0x80) {
|
||
(*uni_out)[0] = 0; // 输出高字节在前
|
||
(*uni_out)[1] = *gbk_in;
|
||
(*uni_out) += 2;
|
||
return gbk_in + 1;
|
||
} else {
|
||
c = (gbk_in[0] << 8) | gbk_in[1];
|
||
if (!g_unigbk_size) // 如果没打开UNIGBK.BIN.
|
||
{
|
||
g_unigbk_size = unigbk_open();
|
||
}
|
||
gbk2uni_offset = g_unigbk_size / 2;
|
||
if (g_unigbk_size) // 存在
|
||
{
|
||
/* Unicode to OEMCP */
|
||
hi = g_unigbk_size / 2; // 对半开.
|
||
hi = hi / 4 - 1;
|
||
li = 0;
|
||
for (n = 16; n; n--) {
|
||
i = li + (hi - li) / 2;
|
||
cout = unigbk_read(i * 4 + gbk2uni_offset, &t, 4);
|
||
if (c == t[0])
|
||
break;
|
||
if (c > t[0])
|
||
li = i;
|
||
else
|
||
hi = i;
|
||
}
|
||
c = n ? t[1] : 0;
|
||
} else
|
||
c = 0;
|
||
(*uni_out)[0] = c >> 8; // 输出高字节在前
|
||
(*uni_out)[1] = c & 0xff;
|
||
*uni_out += 2;
|
||
return gbk_in + 2;
|
||
}
|
||
// return 0;
|
||
}
|
||
|
||
// Unicode编码转GBK编码
|
||
// 高字节在前
|
||
uint8_t *uni2gbk(uint8_t *uni_in, uint8_t **gbk_out) {
|
||
uint16_t t[2];
|
||
uint16_t c;
|
||
uint32_t i, li, hi;
|
||
uint16_t n;
|
||
unsigned int cout;
|
||
uint32_t gbk2uni_offset = 0;
|
||
|
||
if (uni_in == 0 || (c = (uni_in[0] << 8) | uni_in[1], c == 0))
|
||
return 0;
|
||
if (c < 0x80) {
|
||
(*gbk_out)[0] = c;
|
||
(*gbk_out) += 1;
|
||
return uni_in + 2;
|
||
} else {
|
||
if (!g_unigbk_size) // 如果没打开UNIGBK.BIN.
|
||
{
|
||
g_unigbk_size = unigbk_open();
|
||
}
|
||
gbk2uni_offset = 0;
|
||
// 因为在编码表文件中是低字节在前,这里重新指定c
|
||
// c=(uni_in[1]<<8)|uni_in[0];
|
||
if (g_unigbk_size) // 存在
|
||
{
|
||
/* Unicode to OEMCP */
|
||
hi = g_unigbk_size / 2; // 对半开.
|
||
hi = hi / 4 - 1;
|
||
li = 0;
|
||
for (n = 16; n; n--) {
|
||
i = li + (hi - li) / 2;
|
||
cout = unigbk_read(i * 4 + gbk2uni_offset, &t, 4);
|
||
if (c == t[0])
|
||
break;
|
||
if (c > t[0])
|
||
li = i;
|
||
else
|
||
hi = i;
|
||
}
|
||
c = n ? t[1] : 0;
|
||
} else
|
||
c = 0;
|
||
(*gbk_out)[0] = c >> 8; // 输出高字节在前
|
||
(*gbk_out)[1] = c & 0xff;
|
||
*gbk_out += 2;
|
||
return uni_in + 2;
|
||
}
|
||
// return 0;
|
||
}
|
||
|
||
// Convert a NUL-terminated GBK string to a UCS-2 string (big-endian),
// terminated by two zero bytes.
//
// Parameters:
//   gbk_in  - source string; a NULL pointer produces an empty result.
//   uni_out - destination buffer, sized by the caller.
void gbk2uni_str(uint8_t *gbk_in, uint8_t *uni_out) {
  uint8_t *dst = uni_out;

  // gbk2uni() hands back the next character, or NULL once it reaches the
  // terminator, which ends the loop.
  for (uint8_t *src = gbk_in; src != NULL; src = gbk2uni(src, &dst)) {
  }

  // 16-bit terminator.
  dst[0] = 0;
  dst[1] = 0;
}
|
||
|
||
// Convert a UCS-2 string (big-endian, terminated by a 0x0000 code unit) to a
// NUL-terminated GBK string.
//
// Parameters:
//   uni_in  - source string; a NULL pointer produces an empty result.
//   gbk_out - destination buffer, sized by the caller.
void uni2gbk_str(uint8_t *uni_in, uint8_t *gbk_out) {
  uint8_t *dst = gbk_out;

  // uni2gbk() hands back the next code unit, or NULL once it reaches the
  // terminator, which ends the loop.
  for (uint8_t *src = uni_in; src != NULL; src = uni2gbk(src, &dst)) {
  }

  *dst = 0;
}
|
||
|
||
// Convert a NUL-terminated GBK string to a NUL-terminated UTF-8 string.
//
// The conversion goes through a temporary UCS-2 buffer obtained from
// mymalloc() and released before returning.
//
// Parameters:
//   gbk_in   - source string. NULL produces an empty result.
//   utf8_out - destination buffer, sized by the caller (a two-byte GBK
//              character can become three UTF-8 bytes). If NULL, the function
//              does nothing.
// If the temporary buffer cannot be allocated, utf8_out is set to the empty
// string.
void gbk2utf8_str(uint8_t *gbk_in, uint8_t *utf8_out) {
  if (utf8_out == NULL) {
    return;
  }
  utf8_out[0] = 0;
  if (gbk_in == NULL) {
    return;
  }

  // Every input byte yields at most one 2-byte code unit, plus a 2-byte
  // terminator. (The previous 3 * length was too small for lengths 0 and 1.)
  size_t gbk_len = strlen((const char *)gbk_in);
  uint8_t *char_uni = mymalloc(gbk_len * 2 + 2);
  if (char_uni == NULL) {
    return;
  }

  gbk2uni_str(gbk_in, char_uni);
  uni2utf8_str(char_uni, utf8_out);
  myfree(char_uni);
}
|
||
|
||
// Convert a NUL-terminated UTF-8 string to a NUL-terminated GBK string.
//
// The conversion goes through a temporary UCS-2 buffer obtained from
// mymalloc() and released before returning.
//
// Parameters:
//   utf8_in - source string. NULL produces an empty result.
//   gbk_out - destination buffer, sized by the caller. If NULL, the function
//             does nothing.
// If the temporary buffer cannot be allocated, gbk_out is set to the empty
// string.
void utf82gbk_str(uint8_t *utf8_in, uint8_t *gbk_out) {
  if (gbk_out == NULL) {
    return;
  }
  gbk_out[0] = 0;
  if (utf8_in == NULL) {
    return;
  }

  // Every input byte yields at most one 2-byte code unit, plus a 2-byte
  // terminator. (The previous 3 * length was too small for lengths 0 and 1.)
  size_t utf8_len = strlen((const char *)utf8_in);
  uint8_t *char_uni = mymalloc(utf8_len * 2 + 2);
  if (char_uni == NULL) {
    return;
  }

  utf82uni_str(utf8_in, char_uni);
  uni2gbk_str(char_uni, gbk_out);
  myfree(char_uni);
}
|
||
|
||
// Round-trip self-test for the encoding converters.
//
// Treats str_input as a GBK string and converts it
// GBK -> UCS-2 -> UTF-8 -> UCS-2 -> GBK, then compares the result with the
// input.
//
// Parameters:
//   str_input - NUL-terminated GBK string to round-trip.
// Returns:
//   0 when the round-tripped string equals str_input; non-zero otherwise
//   (the strcmp() result, or -1 when str_input is NULL or a work buffer
//   could not be allocated).
int char_encode_test(char *str_input) {
  if (str_input == NULL) {
    return -1;
  }

  size_t str_size = strlen(str_input);
  // 3 bytes per input byte covers every intermediate form; the extra 4 bytes
  // leave room for the terminators even when the input is empty or one byte.
  size_t buf_size = str_size * 3 + 4;
  int ret = -1;

  uint8_t *char_gbk = mymalloc(buf_size);
  uint8_t *char_uni = mymalloc(buf_size);
  uint8_t *char_utf8 = mymalloc(buf_size);

  if (char_gbk != NULL && char_uni != NULL && char_utf8 != NULL) {
    mymemset(char_gbk, 0, buf_size);
    mymemset(char_uni, 0, buf_size);
    mymemset(char_utf8, 0, buf_size);

    // Work on a zero-padded copy of the input.
    mymemcpy(char_gbk, str_input, str_size);

    // GBK -> UCS-2
    gbk2uni_str(char_gbk, char_uni);
    mymemset(char_gbk, 0, buf_size);

    // UCS-2 -> UTF-8
    uni2utf8_str(char_uni, char_utf8);
    mymemset(char_uni, 0, buf_size);

    // UTF-8 -> UCS-2
    utf82uni_str(char_utf8, char_uni);

    // UCS-2 -> GBK
    uni2gbk_str(char_uni, char_gbk);

    ret = strcmp((const char *)char_gbk, str_input);
  }

  // myfree() is a project allocator; do not assume it accepts NULL.
  if (char_gbk != NULL) {
    myfree(char_gbk);
  }
  if (char_uni != NULL) {
    myfree(char_uni);
  }
  if (char_utf8 != NULL) {
    myfree(char_utf8);
  }
  return ret;
}
|