Skip to content

Commit 32461df

Browse files
committed
初步实现对utf-8文本的支持
1 parent 5d968f7 commit 32461df

File tree

10 files changed

+177
-40
lines changed

10 files changed

+177
-40
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ x64/
33
cmake-build-debug/
44
*.pdb.idea/
55
cmake-build-release/
6+
cmake-build-debug-mingw/

.idea/editor.xml

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

.idea/encodings.xml

Lines changed: 2 additions & 12 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

CMakeLists.txt

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ cmake_minimum_required(VERSION 3.27)
22

33
PROJECT(JsonParser C)
44
set(CMAKE_C_STANDARD 17)
5-
65
add_library(jsonParserLib
76
core/Json.h
87
core/parser/parser.h
@@ -11,15 +10,16 @@ add_library(jsonParserLib
1110
core/types/JsonObject.h
1211
core/types/JsonString.h
1312
core/utils/outputer.h
13+
core/utils/outputer.c
1414
core/parser/parser.c
1515
core/types/JsonValue.c
1616
core/types/JsonArray.c
1717
core/types/JsonObject.c
1818
core/types/JsonString.c
19-
core/utils/outputer.c
20-
core/utils/UTF2GBK.cpp
21-
core/utils/UTF2GBK.h
19+
core/utils/utf2gbk/UTF2GBK.c
20+
core/utils/utf2gbk/UTF2GBK.h
2221
)
22+
target_link_libraries(jsonParserLib PRIVATE iconv)
2323

2424
add_executable(json
2525
cli/main.c

cli/main.c

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@ struct CommandLineArgs {
1212
FILE* output; // 输出流
1313
int compress; // 是否压缩
1414
int format; // 是否格式化
15+
int utf8Text; // 是否为utf-8 文本 是则需资源回收时删除中间文件
16+
char* convertCacheFilePath; // 为utf-8文本时转换为gbk格式时生成的临时文件
17+
char* outputFilePath; // 需要输出文件路径, 当为utf-8文本时用于转换回utf-8文本
18+
1519
};
1620

1721
// 函数声明
@@ -40,6 +44,9 @@ int main(const int argc, char* argv[]) {
4044
}
4145
if (args.output != stdout) {
4246
fclose(args.output);
47+
if(args.utf8Text) {
48+
convertGbkToUtf8(args.output);
49+
}
4350
}
4451
return 0;
4552
}
@@ -53,7 +60,8 @@ struct CommandLineArgs parseCommandLineArgs(int argc, char* argv[]) {
5360
args.output = stdout;
5461
args.compress = 0;
5562
args.format = 1;
56-
63+
args.utf8Text = 0;
64+
args.convertCacheFilePath = "";
5765
// 标记是否已经出现了--format或--compress
5866
int formatSeen = 0;
5967
int compressSeen = 0;
@@ -64,6 +72,7 @@ struct CommandLineArgs parseCommandLineArgs(int argc, char* argv[]) {
6472
if (i + 1 < argc) {
6573
printf("Output: %s\n", argv[i + 1]);
6674
args.output = fopen(argv[i + 1], "w");
75+
args.outputFilePath = argv[i+1];
6776
if (args.output == NULL) {
6877
perror("Error opening output file");
6978
exit(EXIT_FAILURE);
@@ -78,7 +87,15 @@ struct CommandLineArgs parseCommandLineArgs(int argc, char* argv[]) {
7887
else if (strcmp(argv[i], "--input") == 0 || strcmp(argv[i], "-if") == 0) {
7988
// 指定输入流
8089
if (i + 1 < argc) {
81-
args.input = fopen(argv[i + 1], "r");
90+
FILE* f = fopen(argv[i + 1], "r");
91+
args.convertCacheFilePath = "__cache.json";
92+
if(isUtf8(f)) {
93+
printf("INFO: Is UTF-8 Text\n");
94+
args.input = convertUtf8ToGbk(f,args.convertCacheFilePath);
95+
args.utf8Text = 1;
96+
}else {
97+
args.input = fopen(argv[i + 1], "r");
98+
}
8299
if (args.input == NULL) {
83100
perror("Error opening input file");
84101
exit(EXIT_FAILURE);

core/Json.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@
44
#include "types/JsonValue.h"
55
#include "types/JsonArray.h"
66
#include "utils/outputer.h"
7-
#include "./parser/parser.h"
7+
#include "parser/parser.h"
8+
#include "utils/utf2gbk/UTF2GBK.h"

core/utils/UTF2GBK.cpp

Lines changed: 0 additions & 5 deletions
This file was deleted.

core/utils/UTF2GBK.h

Lines changed: 0 additions & 16 deletions
This file was deleted.

core/utils/utf2gbk/UTF2GBK.c

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <iconv.h>
5+
6+
/**
 * Convert a UTF-8 stream to GBK and store the result in a file.
 *
 * The whole of `input` is read from its current position, converted with
 * iconv and written to `outputFileName` (created or truncated). `input` is
 * closed before returning.
 *
 * @param input          Stream open for reading that yields UTF-8 bytes.
 * @param outputFileName Path of the file that receives the GBK bytes.
 * @return A new stream opened for reading on `outputFileName`, or NULL when
 *         either argument is NULL or the file cannot be reopened. The caller
 *         owns the returned stream and must fclose() it.
 *
 * Conversion and I/O failures are reported with perror() and terminate the
 * process with EXIT_FAILURE.
 */
FILE* convertUtf8ToGbk(FILE* input, const char* outputFileName) {
    if (input == NULL || outputFileName == NULL) {
        return NULL;
    }

    iconv_t cd = iconv_open("GBK", "UTF-8");
    if (cd == (iconv_t)-1) {
        perror("iconv_open");
        exit(EXIT_FAILURE);
    }

    FILE* output = fopen(outputFileName, "w");
    if (output == NULL) {
        perror("fopen");
        exit(EXIT_FAILURE);
    }

    char inputBuffer[4096];
    char outputBuffer[8192];
    /* Bytes of an incomplete multibyte character carried over from the
     * previous chunk; they sit at the start of inputBuffer. */
    size_t pending = 0;

    for (;;) {
        size_t bytesRead = fread(inputBuffer + pending, 1,
                                 sizeof(inputBuffer) - pending, input);
        if (bytesRead == 0) {
            break;
        }

        char* inputPtr = inputBuffer;
        size_t inputBytesLeft = pending + bytesRead;

        while (inputBytesLeft > 0) {
            char* outputPtr = outputBuffer;
            size_t outputBytesLeft = sizeof(outputBuffer);

            size_t rc = iconv(cd, &inputPtr, &inputBytesLeft,
                              &outputPtr, &outputBytesLeft);
            /* Save errno right away: fwrite() below may overwrite it. */
            int convError = (rc == (size_t)-1) ? errno : 0;

            /* iconv may have produced output even when it reports an error. */
            size_t produced = sizeof(outputBuffer) - outputBytesLeft;
            if (fwrite(outputBuffer, 1, produced, output) != produced) {
                perror("fwrite");
                exit(EXIT_FAILURE);
            }

            if (convError == 0 || (convError == E2BIG && produced > 0)) {
                /* Done with this chunk, or output buffer was full: go on. */
                continue;
            }
            if (convError == EINVAL) {
                /* The chunk ends inside a multibyte character; keep the tail
                 * and complete it with the next read. */
                break;
            }
            errno = convError;
            perror("iconv");
            exit(EXIT_FAILURE);
        }

        pending = inputBytesLeft;
        memmove(inputBuffer, inputPtr, pending);
    }

    if (ferror(input)) {
        perror("fread");
        exit(EXIT_FAILURE);
    }
    if (pending > 0) {
        /* End of input reached in the middle of a multibyte character. */
        errno = EINVAL;
        perror("iconv");
        exit(EXIT_FAILURE);
    }

    iconv_close(cd);
    fclose(input);
    if (fclose(output) != 0) {
        perror("fclose");
        exit(EXIT_FAILURE);
    }

    return fopen(outputFileName, "r");
}
46+
47+
void convertGbkToUtf8(FILE* file) {
48+
// Get the size of the file
49+
fseek(file, 0, SEEK_END);
50+
long size = ftell(file);
51+
rewind(file);
52+
53+
// Read the content into a buffer
54+
char* buffer = (char*)malloc(size + 1);
55+
if (buffer == NULL) {
56+
perror("malloc");
57+
exit(EXIT_FAILURE);
58+
}
59+
60+
size_t bytesRead = fread(buffer, 1, size, file);
61+
buffer[bytesRead] = '\0'; // Null-terminate the buffer
62+
63+
// Close the original file
64+
fclose(file);
65+
66+
// Convert the content from GBK to UTF-8
67+
iconv_t cd;
68+
cd = iconv_open("UTF-8", "GBK");
69+
if (cd == (iconv_t)-1) {
70+
perror("iconv_open");
71+
exit(EXIT_FAILURE);
72+
}
73+
74+
// Create a temporary buffer for the converted content
75+
char* tempBuffer = (char*)malloc(size * 4); // UTF-8 can be up to 4 bytes per character
76+
if (tempBuffer == NULL) {
77+
perror("malloc");
78+
exit(EXIT_FAILURE);
79+
}
80+
81+
char* inputPtr = buffer;
82+
char* outputPtr = tempBuffer;
83+
size_t inputBytesLeft = bytesRead;
84+
size_t outputBytesLeft = size * 4;
85+
86+
if (iconv(cd, &inputPtr, &inputBytesLeft, &outputPtr, &outputBytesLeft) == (size_t)-1) {
87+
perror("iconv");
88+
exit(EXIT_FAILURE);
89+
}
90+
91+
// Close the iconv descriptor
92+
iconv_close(cd);
93+
94+
// Reopen the original file for writing
95+
file = fopen("converted_file.txt", "w");
96+
if (file == NULL) {
97+
perror("fopen");
98+
exit(EXIT_FAILURE);
99+
}
100+
101+
// Write the converted content back to the file
102+
fwrite(tempBuffer, 1, size * 4 - outputBytesLeft, file);
103+
104+
// Close the file and free the buffers
105+
fclose(file);
106+
free(buffer);
107+
free(tempBuffer);
108+
}
109+
110+
111+
/**
 * Decide whether a stream holds UTF-8 text without a byte-order mark.
 *
 * @param file Stream open for reading and seekable; may be NULL.
 * @return 1 when the stream has at least three bytes, does not start with the
 *         UTF-8 BOM (EF BB BF) and every byte sequence in it is well-formed
 *         UTF-8 (pure ASCII included); 0 otherwise, including for NULL.
 *
 * The stream is rewound to its beginning before returning.
 */
int isUtf8(FILE* file) {
    if (file == NULL) {
        return 0;
    }

    rewind(file);

    /* Look at the first three bytes for a BOM. */
    unsigned char bom[3];
    size_t bytesRead = fread(bom, 1, sizeof(bom), file);
    rewind(file);

    /* Too short to classify. */
    if (bytesRead < sizeof(bom)) {
        return 0;
    }
    /* A BOM means this is not "UTF-8 without BOM". */
    if (bom[0] == 0xEF && bom[1] == 0xBB && bom[2] == 0xBF) {
        return 0;
    }

    /* The absence of a BOM alone proves nothing (a GBK file has none
     * either), so check that the content really is well-formed UTF-8. */
    int valid = 1;
    int lead;
    while (valid && (lead = getc(file)) != EOF) {
        int continuationCount;
        /* Allowed range of the first continuation byte; narrowed for some
         * lead bytes to reject overlong forms, surrogates and code points
         * above U+10FFFF. */
        int low = 0x80;
        int high = 0xBF;

        if (lead <= 0x7F) {
            continue;
        } else if (lead >= 0xC2 && lead <= 0xDF) {
            continuationCount = 1;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            continuationCount = 2;
            if (lead == 0xE0) {
                low = 0xA0;
            } else if (lead == 0xED) {
                high = 0x9F;
            }
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            continuationCount = 3;
            if (lead == 0xF0) {
                low = 0x90;
            } else if (lead == 0xF4) {
                high = 0x8F;
            }
        } else {
            /* Stray continuation byte or a byte never used by UTF-8. */
            valid = 0;
            break;
        }

        for (int i = 0; i < continuationCount; i++) {
            int next = getc(file);
            if (next == EOF || next < low || next > high) {
                valid = 0;
                break;
            }
            /* Only the first continuation byte has a restricted range. */
            low = 0x80;
            high = 0xBF;
        }
    }

    rewind(file);
    return valid;
}

core/utils/utf2gbk/UTF2GBK.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
//
2+
// Created by undefined on 2023/12/10.
3+
//
4+
#pragma once
5+
#include <stdio.h>
6+
7+
/**
 * Convert the UTF-8 content of `input` to GBK and write it to the file named
 * `outputFileName`. `input` is closed by the call. Returns a new stream opened
 * for reading on `outputFileName`; the caller is responsible for closing it.
 */
FILE* convertUtf8ToGbk(FILE* input, const char* outputFileName);
8+
/**
 * Read the whole of `input` as GBK, close it, and write the UTF-8 conversion
 * to the file "converted_file.txt". `input` must be readable and seekable.
 */
void convertGbkToUtf8(FILE* input);
9+
/**
 * Return non-zero when `file` is judged to be UTF-8 text without a BOM, and 0
 * otherwise (including when it starts with the UTF-8 BOM or holds fewer than
 * three bytes). The stream is rewound to its beginning before returning.
 */
int isUtf8(FILE* file);

0 commit comments

Comments
 (0)