Skip to content

Commit d4aa952

Browse files
authored
Try to fix #255, #242 (#248)
* WIP * Make debugging easy for fix encoding bugs * Fix encoding problem that is #225 #242 * More simple implementation for bytes compatible * Make more simple * Remove debugging code * It is a classmethod, not instance method * Add a test case for sudden EOF * Rename to the correct name * Care multiple scriptencoding * Fix a problem about debug_hint overwriting * Care single line scriptencoding * decoding error is not a RuntimeError but Exception * More debug_hint * Fix a problem about missing last char * Change Chardet priority * Revert "WIP" This reverts commit 1fb7dfc. * Split files * Try to resolve module name conflict * Cosmetic changes * Compose strategies to decoding_strategy
1 parent 79b0f1a commit d4aa952

18 files changed

+302
-29
lines changed

dev_tool/show_ast.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
#!/usr/bin/env python
1+
#!/usr/bin/env python3
22

33
import sys
44
from argparse import ArgumentParser
55
from pathlib import Path
66
from pprint import pprint
77

88
vint_root = Path(__file__).resolve().parent.parent
9-
sys.path.append(str(vint_root))
9+
sys.path.insert(0, str(vint_root))
1010

1111
from vint.ast.node_type import NodeType
1212
from vint.ast.traversing import traverse

dev_tool/show_chardet_result.py

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#!/usr/bin/env python3
2+
3+
import chardet
4+
import sys
5+
from pprint import pprint
6+
from pathlib import Path
7+
from argparse import ArgumentParser
8+
9+
10+
def main(file_path):
    # type: (Path) -> None
    """Print what chardet detects for the raw bytes of ``file_path``."""
    with file_path.open(mode='rb') as stream:
        raw_content = stream.read()

    detection = chardet.detect(raw_content)
    pprint(detection)
17+
18+
19+
if __name__ == '__main__':
    # The program name, description and help text must describe this tool;
    # they previously advertised it as 'show_ast' / 'Show AST'.
    arg_parser = ArgumentParser(
        prog='show_chardet_result',
        description='Show the encoding detected by chardet')
    arg_parser.add_argument('file', nargs=1, help='File to detect encoding')
    namespace = vars(arg_parser.parse_args(sys.argv[1:]))

    # nargs=1 yields a one-element list, hence the [0].
    main(Path(namespace['file'][0]))

dev_tool/show_encoding.py

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/usr/bin/env python3
2+
3+
import sys
4+
from argparse import ArgumentParser
5+
from pathlib import Path
6+
from pprint import pprint
7+
8+
vint_root = Path(__file__).resolve().parent.parent
9+
sys.path.insert(0, str(vint_root))
10+
11+
from vint.encodings.decoder import Decoder
12+
from vint.encodings.decoding_strategy import default_decoding_strategy
13+
14+
15+
if __name__ == '__main__':
    # The program name and description must describe this tool; they
    # previously advertised it as 'show_ast' / 'Show AST'.
    arg_parser = ArgumentParser(
        prog='show_encoding',
        description='Show how the default decoding strategy decodes a file')
    arg_parser.add_argument('file', nargs=1, help='File to detect encoding')
    namespace = vars(arg_parser.parse_args(sys.argv[1:]))

    # nargs=1 yields a one-element list, hence the [0].
    file_path = Path(namespace['file'][0])
    decoder = Decoder(default_decoding_strategy)
    decoder.read(file_path)

    # debug_hint is filled in by decoder.read() for every decoded hunk.
    pprint(decoder.debug_hint)

test/fixture/encodings/__init__.py

Whitespace-only changes.

test/fixture/encodings/ascii.vim

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
echo "Only ASCII"

test/fixture/encodings/cp932.vim

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
scriptencoding cp932 "日本語

test/fixture/encodings/empty.vim

Whitespace-only changes.

test/fixture/encodings/issue-225.vim

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
scriptencoding utf-8
2+
" :purple_heart: 💜
3+
" set list listchars=tab:»·,trail:·,eol:¬,nbsp:_,extends:❯,precedes:❮
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
echo "before 1"
2+
scriptencoding utf8
3+
echo "before 2"
4+
scriptencoding utf8
5+
echo "after 2"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
echo "no scriptencofing 1"
2+
echo "no scriptencofing 2"
3+
echo "no scriptencofing 3"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
echo "no scriptencoding 1"
2+
echo "no scriptencoding 2"
3+
echo "no scriptencoding 3"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
echo "before"
2+
scriptencoding utf8
3+
echo "after"

test/fixture/encodings/sudden_eof.vim

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
scriptencoding

vint/ast/parsing.py

+10-26
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,9 @@
1-
import chardet
21
import re
32
from vint._bundles import vimlparser
43
from vint.ast.traversing import traverse
5-
6-
7-
class EncodingDetectionError(Exception):
8-
def __init__(self, file_path):
9-
self.file_path = file_path
10-
11-
12-
def __str__(self):
13-
return 'Cannot detect encoding (binary file?): {file_path}'.format(
14-
file_path=str(self.file_path))
4+
from vint.encodings.decoder import Decoder
5+
from vint.encodings.decoding_strategy import default_decoding_strategy
6+
from pprint import pprint
157

168

179
class Parser(object):
@@ -42,23 +34,15 @@ def parse(self, string):
4234

4335
def parse_file(self, file_path):
4436
""" Parse vim script file and return the AST. """
45-
with file_path.open(mode='rb') as f:
46-
bytes_seq = f.read()
47-
48-
is_empty = len(bytes_seq) == 0
49-
if is_empty:
50-
return self.parse('')
51-
52-
encoding_hint = chardet.detect(bytes_seq)
53-
encoding = encoding_hint['encoding']
54-
if not encoding:
55-
# Falsey means we cannot detect the encoding of the file.
56-
raise EncodingDetectionError(file_path)
57-
58-
decoded = bytes_seq.decode(encoding)
59-
decoded_and_lf_normalized = decoded.replace('\r\n', '\n')
37+
decoder = Decoder(default_decoding_strategy)
38+
decoded = decoder.read(file_path)
39+
decoded_and_lf_normalized = decoded.replace('\r\n', '\n')
6040

41+
try:
6142
return self.parse(decoded_and_lf_normalized)
43+
except vimlparser.VimLParserException:
44+
pprint(decoder.debug_hint)
45+
raise
6246

6347

6448
def parse_redir(self, redir_cmd):

vint/encodings/__init__.py

Whitespace-only changes.

vint/encodings/decoder.py

+74
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
import sys
2+
from typing import Dict, Any
3+
from pprint import pformat
4+
from pathlib import Path
5+
from vint.encodings.decoding_strategy import DecodingStrategy
6+
7+
8+
# Byte pattern used to find where a `scriptencoding` command starts.
SCRIPTENCODING_PREFIX = bytearray(b'scriptencoding')
9+
10+
11+
12+
class Decoder(object):
    """Read a file as bytes and decode it hunk by hunk with a strategy.

    The content is cut by `_split_by_scriptencoding` and every hunk is
    decoded independently by the given strategy.
    """

    def __init__(self, strategy):
        # type: (DecodingStrategy) -> None
        self.strategy = strategy
        # Diagnostics: the interpreter version plus one dict per decoded
        # hunk, keyed by the hunk's "start:end" location.
        self.debug_hint = {'version': sys.version}

    def read(self, file_path):
        # type: (Path) -> str
        """Return the decoded content of ``file_path``.

        Raises EncodingDetectionError (carrying ``self.debug_hint``) when
        the strategy returns None for any hunk.
        """
        with file_path.open(mode='rb') as stream:
            raw_content = stream.read()

        decoded_hunks = []

        for location, hunk in _split_by_scriptencoding(raw_content):
            # Register the per-hunk dict first so it is part of the
            # diagnostics even when decoding this hunk fails.
            hunk_hint = {}
            self.debug_hint[location] = hunk_hint

            decoded = self.strategy.decode(hunk, debug_hint=hunk_hint)

            if decoded is None:
                raise EncodingDetectionError(self.debug_hint)

            decoded_hunks.append(decoded)

        return ''.join(decoded_hunks)
38+
39+
40+
def _split_by_scriptencoding(bytes_seq):
    # type: (bytes) -> [(str, bytes)]
    """Cut ``bytes_seq`` into hunks at each later `scriptencoding` occurrence.

    Returns a list of ``(location, hunk)`` pairs where ``location`` is the
    string "start:end" of the hunk within ``bytes_seq``. Concatenating the
    hunks gives back ``bytes_seq``; an empty input yields one empty hunk.
    """
    total_length = len(bytes_seq)
    hunks = []
    hunk_start = 0

    while True:
        # Search from one byte past the hunk start so a `scriptencoding`
        # sitting exactly at the hunk start is not matched again.
        hunk_end = bytes_seq.find(SCRIPTENCODING_PREFIX, hunk_start + 1)

        if hunk_end < 0:
            # No further occurrence: the hunk runs to the end of the input.
            hunk_end = total_length

        location = "{start_index}:{end_index}".format(start_index=hunk_start, end_index=hunk_end)
        hunks.append((location, bytes_seq[hunk_start:hunk_end]))

        if hunk_end >= total_length:
            return hunks

        hunk_start = hunk_end
62+
63+
64+
class EncodingDetectionError(Exception):
    """Error carrying the decoding diagnostics collected so far."""

    def __init__(self, debug_hint):
        # type: (Dict[str, Any]) -> None
        self.debug_hint = debug_hint

    def __str__(self):
        # type: () -> str
        # Pretty-print the diagnostics so they appear in the error message.
        formatted_hint = pformat(self.debug_hint)
        return 'Cannot detect encoding (binary file?): {debug_hint}'.format(debug_hint=formatted_hint)

vint/encodings/decoding_strategy.py

+147
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
import chardet
2+
from typing import Optional, Dict, Any
3+
4+
5+
# Byte patterns used to locate a `scriptencoding` command and the end of
# its argument (a trailing comment or the end of the line).
SCRIPTENCODING_PREFIX = bytearray(b'scriptencoding')
COMMENT_START_TOKEN = bytearray(b'"')
LF = bytearray(b'\n')
8+
9+
10+
class DecodingStrategy(object):
    """Interface of a decoding strategy; subclasses override `decode`."""

    def decode(self, bytes_seq, debug_hint):
        # type: (bytes, Dict[str, Any]) -> Optional[str]
        """Return the decoded text, or None when this strategy cannot decode.

        ``debug_hint`` is a dict the strategy may fill with diagnostics.
        """
        raise NotImplementedError()
14+
15+
16+
class DecodingStrategyByChardet(DecodingStrategy):
    """Decode with whatever encoding chardet reports for the bytes."""

    def decode(self, bytes_seq, debug_hint):
        # type: (bytes, Dict[str, Any]) -> Optional[str]
        """Return the text decoded with chardet's guess, or None on failure.

        Records 'chardet_encoding' and 'chardet_confidence' in
        ``debug_hint``, plus 'chardet_error' when decoding raises.
        """
        detection = chardet.detect(bytearray(bytes_seq))
        guessed_encoding = detection['encoding']

        debug_hint['chardet_encoding'] = guessed_encoding
        debug_hint['chardet_confidence'] = detection['confidence']

        try:
            # Any failure here (including an unusable guess) is reported
            # through debug_hint instead of being raised.
            decoded = bytes_seq.decode(guessed_encoding)
        except Exception as error:
            debug_hint['chardet_error'] = str(error)
            decoded = None

        return decoded
31+
32+
33+
class ComposedDecodingStrategy(DecodingStrategy):
    """Try several strategies in order and use the first that succeeds."""

    def __init__(self, strategies):
        # type: ([DecodingStrategy]) -> None
        self.strategies = strategies

    def decode(self, bytes_seq, debug_hint):
        # type: (bytes, Dict[str, Any]) -> Optional[str]
        """Return the first non-None result of the composed strategies.

        Records the class names of all strategies under
        'composed_strategies' and of the successful one under
        'selected_strategy'. Returns None when every strategy returns None.
        """
        debug_hint['composed_strategies'] = [type(candidate).__name__ for candidate in self.strategies]

        for strategy in self.strategies:
            decoded = strategy.decode(bytes_seq, debug_hint)

            if decoded is not None:
                debug_hint['selected_strategy'] = type(strategy).__name__
                return decoded

        return None
53+
54+
55+
class DecodingStrategyForEmpty(DecodingStrategy):
    """Handle empty input, which needs no encoding at all."""

    def decode(self, bytes_seq, debug_hint):
        # type: (bytes, Dict[str, Any]) -> Optional[str]
        """Return '' for empty input and None otherwise.

        Records 'true' or 'false' under the 'empty' key of ``debug_hint``.
        """
        is_empty = len(bytes_seq) <= 0
        debug_hint['empty'] = 'true' if is_empty else 'false'
        return '' if is_empty else None
64+
65+
66+
class DecodingStrategyByScriptencoding(DecodingStrategy):
    """Decode with the encoding named by a `scriptencoding` command."""

    def decode(self, bytes_seq, debug_hint):
        # type: (bytes, Dict[str, Any]) -> Optional[str]
        """Decode ``bytes_seq`` with the encoding its `scriptencoding` names.

        Returns the decoded text, or None when there is no `scriptencoding`,
        the named encoding is unknown, or the bytes are not valid in that
        encoding. The parsed name is recorded under 'scriptencoding' and any
        failure under 'scriptencoding_error' in ``debug_hint``.
        """
        encoding_part = DecodingStrategyByScriptencoding.parse_script_encoding(bytes_seq, debug_hint)

        if encoding_part is None:
            debug_hint['scriptencoding'] = 'None'
            return None

        debug_hint['scriptencoding'] = encoding_part

        try:
            return bytes_seq.decode(encoding=encoding_part.decode(encoding='ascii'))

        except (LookupError, ValueError) as e:
            # LookupError: the encoding name is unknown.
            # ValueError covers UnicodeError: the name is not ASCII, or the
            # bytes are not valid in the declared encoding. Returning None
            # lets the next strategy have a try instead of crashing.
            debug_hint['scriptencoding_error'] = str(e)
            return None

    @classmethod
    def parse_script_encoding(cls, bytes_seq, debug_hint):
        # type: (bytes, Dict[str, Any]) -> Optional[bytes]
        """Return the argument of the first `scriptencoding` in ``bytes_seq``.

        The argument runs from the end of the `scriptencoding` keyword up to
        the first line break or comment start (`"`), whichever comes first,
        or to the end of the input when neither follows. Surrounding
        whitespace is stripped, so the result may be empty. Returns None
        (and records 'scriptencoding_error') when the keyword is absent.
        """
        start_index = bytes_seq.find(SCRIPTENCODING_PREFIX)

        if start_index < 0:
            debug_hint['scriptencoding_error'] = '`scriptencoding` is not found'
            return None

        encoding_part_start_index = start_index + len(SCRIPTENCODING_PREFIX)

        # Possible terminators of the argument:
        #   :scriptencoding foo\n
        #   :scriptencoding foo "comment
        terminator_indices = [
            index
            for index in (
                bytes_seq.find(LF, encoding_part_start_index),
                bytes_seq.find(COMMENT_START_TOKEN, encoding_part_start_index),
            )
            if index >= 0
        ]

        if terminator_indices:
            encoding_part_end_index = min(terminator_indices)
        else:
            # Case for :scriptencoding foo<EOF>. The slice end is exclusive,
            # so it must be len(bytes_seq); len(bytes_seq) - 1 would drop
            # the last character of the encoding name.
            encoding_part_end_index = len(bytes_seq)

        encoding_part_candidate = bytes_seq[encoding_part_start_index:encoding_part_end_index]
        return encoding_part_candidate.strip()
125+
126+
127+
class DecodingStrategyForUTF8(DecodingStrategy):
    """Decode the bytes as UTF-8."""

    def decode(self, bytes_seq, debug_hint):
        # type: (bytes, Dict[str, Any]) -> Optional[str]
        """Return the UTF-8 decoded text, or None when decoding fails.

        Records 'success' or 'failed: <reason>' under the 'utf-8' key of
        ``debug_hint``.
        """
        try:
            decoded = bytes_seq.decode('utf8')
            debug_hint['utf-8'] = 'success'
        except Exception as error:
            debug_hint['utf-8'] = 'failed: {}'.format(str(error))
            decoded = None

        return decoded
140+
141+
142+
# Strategies are tried in this order; the first one that returns a string
# wins (see ComposedDecodingStrategy.decode).
default_decoding_strategy = ComposedDecodingStrategy(
    [
        DecodingStrategyForEmpty(),
        DecodingStrategyByScriptencoding(),
        DecodingStrategyForUTF8(),
        DecodingStrategyByChardet(),
    ]
)

vint/linting/linter.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
import logging
33
from pathlib import Path
44
from vint._bundles import vimlparser
5-
from vint.ast.parsing import Parser, EncodingDetectionError
5+
from vint.encodings.decoder import EncodingDetectionError
6+
from vint.ast.parsing import Parser
67
from vint.ast.node_type import NodeType
78
from vint.ast.traversing import traverse
89
from vint.ast.plugin.scope_plugin import ScopePlugin

0 commit comments

Comments
 (0)