Skip to content

Commit 0d09dec

Browse files
authored
Merge pull request #13 from half-pie/fix-ratio
Fix ratio && update readme
2 parents 1eb61c1 + 120f5e2 commit 0d09dec

11 files changed

+154
-32
lines changed

README.md

+50-23
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## Usage
1+
# Usage
22

33
```python
44
In [1]: from half_json.core import JSONFixer
@@ -10,37 +10,66 @@ Out[3]: FixResult(success=True, line='[{}]', origin=False)
1010

1111
In [4]: f.fix('{"a')
1212
Out[4]: FixResult(success=True, line='{"a":null}', origin=False)
13+
14+
In [5]: f.fix('{"a":}')
15+
Out[5]: FixResult(success=True, line='{"a":null}', origin=False)
1316
```
1417

1518
## 目标
1619

17-
修复残破的 json
20+
fix invalid/broken/truncated json
1821

19-
## 修复原理
22+
# 修复原理
2023

21-
1. 根据异常提示来做一些操作, json 预期啥给啥
22-
2. 根据文本前后,删除一些 BadCase
24+
1. JSONDecodeError
25+
2. line context
2326

2427
## HitRatio
2528

29+
根据 checks 里面的工具来衡量修改效果
30+
31+
ABC : autogen --> broken --> check
32+
TSR : runtest.sh, runshow.sh, runratio.sh
33+
34+
### FixRatio
35+
36+
仅判断 result.success == True
37+
2638
```bash
27-
./runtest.sh
28-
# 查看准确率
29-
seq 1 10|xargs -I {} ./runtest.sh|grep ratio: |awk '{t += $3; h+= $6}{print h/t}'|tail -1
39+
./runratio.sh fix
3040
```
31-
1. 0.4269, 0.4287, 0.4303 # 实现完 12 条规则
32-
2. 0.5037, 0.5084, 0.5077 # string 的 " 补充在末尾
33-
3. 0.5259, 0.5224, 0.5187 # Array 需要 pos - 2
34-
4. 0.5433, 0.5311, 0.5381 # Array 细化一下 [, 的情况
35-
5. 0.7192, 0.7216, 0.7265 # 大改进, FIX 之前的 Bug( parser 被冲掉了)
36-
6. 0.7732, 0.7686, 0.7701 # case: {"a":1 --> 补充 }
37-
7. 0.60 , 0.58 # 去掉了空行
38-
8. 0.6971, 0.6969, 0.6984 # 增加处理 StopIteration
39-
9. 0.7428, 0.7383, 0.7427 # 增加处理 half parse
40-
10. 0.7617,0.7631, 0.7558 # 细化处理 half parse
41-
11. 0.7608,0.7612, 0.7650 # 添加从左处理
42-
12. 0.9817,0.9827, 0.9819 # 增加对字符串的处理
43-
13. 0.8314,0.8302, 0.8312 # 去掉对字符串的额外处理
41+
```
42+
1. 0.4269, 0.4287, 0.4303 # 实现完 12 条规则
43+
2. 0.5037, 0.5084, 0.5077 # string 的 " 补充在末尾
44+
3. 0.5259, 0.5224, 0.5187 # Array 需要 pos - 2
45+
4. 0.5433, 0.5311, 0.5381 # Array 细化一下 [, 的情况
46+
5. 0.7192, 0.7216, 0.7265 # 大改进, FIX 之前的 Bug( parser 被冲掉了)
47+
6. 0.7732, 0.7686, 0.7701 # case: {"a":1 --> 补充 }
48+
7. 0.60 , 0.58 # 去掉了空行
49+
8. 0.6971, 0.6969, 0.6984 # 增加处理 StopIteration
50+
9. 0.7428, 0.7383, 0.7427 # 增加处理 half parse
51+
10. 0.7617, 0.7631, 0.7558 # 细化处理 half parse
52+
11. 0.7608, 0.7612, 0.7650 # 添加从左处理
53+
12. 0.9817, 0.9827, 0.9819 # 增加对字符串的处理
54+
13. 0.8314, 0.8302, 0.8312 # 去掉对字符串的额外处理
55+
14. 0.95X # 已不可参考
56+
```
57+
58+
### Real HitRatio
59+
60+
判断 result.success == True
61+
62+
并且解析后的 json 大体和原来一致(equal && dictdiffer)
63+
64+
```bash
65+
./runratio.sh hit
66+
```
67+
```
68+
1. 0.5610, 0.5563, 0.5529 # origin
69+
2. 0.5593, 0.5532, 0.5587 # fix :} --> :null}
70+
```
71+
72+
# TODO
4473

4574
## 目前的缺点 && 发现
4675

@@ -50,8 +79,6 @@ seq 1 10|xargs -I {} ./runtest.sh|grep ratio: |awk '{t += $3; h+= $6}{print h/t}
5079
应该明确 JSONFixer 的能力范围, 对 runratio.sh 也应该比较前后两个的 json 相似程度。
5180
(听起来像无能者的辩白?)
5281

53-
## TODO
54-
5582
1. 考虑分支回溯的方式来试探
5683
2. 解析缺失的 JSON 常量
5784

checks/gen.py checks/autogen.py

File renamed without changes.

checks/broken.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
def borken(s):
88
idx = random.randint(0, len(s) + 1)
9+
# TODO add count
910
return s[:idx] + s[idx + 1:]
1011

1112

@@ -23,8 +24,11 @@ def main(inflie, outfile):
2324
try:
2425
json.loads(new_line)
2526
except Exception:
26-
# only broken
27-
outf.write(new_line)
27+
out = {
28+
'origin': line,
29+
'broken': new_line
30+
}
31+
outf.write(json.dumps(out))
2832
outf.write('\n')
2933

3034
inf.close()

checks/check.py

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# coding=utf8
2+
import sys
3+
import json
4+
5+
from half_json.core import JSONFixer
6+
7+
f = JSONFixer(100)
8+
9+
10+
def json_equal(line, origin):
11+
return json.loads(line) == json.loads(origin)
12+
13+
14+
def main(inflie, outfile):
15+
inf = open(inflie, 'r')
16+
outf = open(outfile, 'w')
17+
18+
total = 0
19+
hit = 0
20+
fix = 0
21+
22+
for line in inf:
23+
info = json.loads(line)
24+
result = f.fix(info['broken'])
25+
info['fixed'] = result.success
26+
info['fix'] = result.line
27+
info['hited'] = False
28+
if info['fixed']:
29+
info['hited'] = json_equal(result.line, info['origin'])
30+
31+
outf.write(json.dumps(info))
32+
outf.write('\n')
33+
34+
if info['fixed']:
35+
fix += 1
36+
if info['hited']:
37+
hit += 1
38+
total += 1
39+
print 'total: %d fix: %d hit: %d' % (total, fix, hit)
40+
print 'fix ratio: %f' % (fix * 1.0 / total)
41+
print 'hit ratio: %f' % (hit * 1.0 / total)
42+
43+
inf.close()
44+
outf.close()
45+
46+
47+
if __name__ == '__main__':
48+
main(sys.argv[1], sys.argv[2])

checks/oneline.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1-
1+
[,]

checks/runratio.sh

+6-1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,7 @@
11
#!/bin/bash
2-
seq 1 20|xargs -I {} ./runtest.sh|grep ratio: |awk '{t += $3; h+= $6}{print h/t}'|tail -1
2+
mode=$1
3+
if [ ! $mode ]; then
4+
mode=fix
5+
fi
6+
seq 1 20|xargs -P 4 -I {} ./runtest.sh {}|grep ratio:|grep $mode|awk -v mode="$mode" '{t += $3}END{printf("%s: %f \n", mode, t/NR)}'
7+
rm random.*

checks/runshow.sh

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#!/bin/bash
2+
id=$1
3+
if [ ! $id ]; then
4+
id=1
5+
fi
6+
base_name=random.$id
7+
cat $base_name.broken.uniq.fix.json|jq -r 'select((.fixed == true) and (.hited == false))|("orgin: "+."origin", "broken:"+."broken","fix: "+."fix")'
8+
# cat $base_name.broken.uniq.fix.json|jq -r 'select((.fixed == false))|("orgin: "+."origin", "broken:"+."broken","fix: "+."fix")'

checks/runtest.sh

+9-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,10 @@
11
#!/bin/bash
2-
python gen.py > random.1.json
3-
python broken.py random.1.json random.1.broken.json
4-
jsonfixer random.1.broken.json random.1.broken.fix.json
2+
id=$1
3+
if [ ! $id ]; then
4+
id=1
5+
fi
6+
base_name=random.$id
7+
python autogen.py > $base_name.json
8+
python broken.py $base_name.json $base_name.broken.json
9+
cat $base_name.broken.json|sort|uniq > $base_name.broken.uniq.json
10+
python check.py $base_name.broken.uniq.json $base_name.broken.uniq.fix.json

half_json/core.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,9 @@ def patch_value_error(self, line, err_info):
103103
if lastchar == "{":
104104
return False, insert_line(line, "}", pos)
105105
return False, insert_line(line, "null}", pos)
106+
# :} --> :null}
107+
if nextchar == "}":
108+
return False, insert_line(line, "null", pos)
106109
# 08.2
107110
return False, insert_line(line, "\"", pos)
108111
# 09
@@ -112,11 +115,14 @@ def patch_value_error(self, line, err_info):
112115
return False, insert_line(line, ",", pos)
113116
# 11
114117
if error == errors.ArrayExceptObject:
115-
# fix-error
118+
# fix [, --> [
116119
if lastchar == "[" and nextchar == ",":
117120
return False, remove_line(line, pos, pos + 1)
118121
if nextchar == ",":
119122
return False, insert_line(line, "null", pos)
123+
# ,] --> ]
124+
if nextchar == "]":
125+
return False, remove_line(line, pos - 1, pos)
120126
# 11.1
121127
if nextchar == "":
122128
# quick

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
setup(
1515
name='jsonfixer',
16-
version='0.1.4b2',
16+
version='0.1.4',
1717
url='https://github.com/half-pie/half-json',
1818
description='jsonfixer: fix invalid json: broken-json / truncated-json.',
1919
long_description_content_type='text/x-rst',

tests/test_cases.py

+18
Original file line numberDiff line numberDiff line change
@@ -84,3 +84,21 @@ def test_case_miss_key(self):
8484
ok, newline, _ = JSONFixer().fix(line)
8585
self.assertTrue(ok)
8686
self.assertEqual('{"":[]}', newline)
87+
88+
def test_object_miss_value(self):
89+
line = '{"V":}'
90+
ok, newline, _ = JSONFixer().fix(line)
91+
self.assertTrue(ok)
92+
self.assertEqual('{"V":null}', newline)
93+
94+
def test_array_miss_value(self):
95+
line = '[,]'
96+
ok, newline, _ = JSONFixer().fix(line)
97+
self.assertTrue(ok)
98+
self.assertEqual('[]', newline)
99+
100+
def test_array_miss_value_2(self):
101+
line = '[null,]'
102+
ok, newline, _ = JSONFixer().fix(line)
103+
self.assertTrue(ok)
104+
self.assertEqual('[null]', newline)

0 commit comments

Comments
 (0)