Skip to content

Commit b212f27

Browse files
committed
Add bytes to unicode converters
Signed-off-by: Konstantin Slavnov <[email protected]>
1 parent 3b0fb81 commit b212f27

9 files changed

+363
-5
lines changed

Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -30,4 +30,4 @@ bblfsh-start:
3030
! docker ps | grep bblfshd # bblfsh server should not be running already
3131
docker run -d --name style_analyzer_bblfshd --privileged -p 9432\:9432 bblfsh/bblfshd\:v2.10.0
3232
docker exec style_analyzer_bblfshd bblfshctl driver install \
33-
javascript docker://bblfsh/javascript-driver\:v1.2.0
33+
javascript docker://bblfsh/javascript-driver\:v2.6.0

lookout/core/analyzer.py

+19
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,25 @@ def to_pb(self) -> ApiReferencePointer:
3232
hash=self.commit)
3333

3434

35+
UnicodeFile = NamedTuple("UnicodeFile", (
    ("content", str), ("uast", "bblfsh.Node"), ("path", str), ("language", str)))
UnicodeFile.__doc__ = """
UnicodeFile is an alternative to `lookout.core.api.service_data_pb2.File`.

It is used in case `@with_unicode_files` or `@with_unicode_changes` decorators are applied.
The main difference is that `content` field is a unicode string and uast offsets are changed to
corresponding unicode string offsets.
""".strip()


UnicodeChange = NamedTuple("UnicodeChange", (("base", UnicodeFile), ("head", UnicodeFile)))
UnicodeChange.__doc__ = """
UnicodeChange is an alternative to `lookout.core.api.service_data_pb2.Change`.

It is used in case `@with_unicode_changes` decorator is applied.
The main difference is that `base` and `head` fields have type `UnicodeFile`. For more info read
documentation about `UnicodeFile`.
""".strip()
52+
53+
3554
class AnalyzerModel(Model):
3655
"""
3756
All models used in `Analyzer`-s must derive from this base class.
+126
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
from collections import deque
from typing import Dict

import bblfsh
from lookout.sdk.service_data_pb2 import Change, File
import numpy

from lookout.core.analyzer import UnicodeChange, UnicodeFile
8+
9+
10+
class BytesToUnicodeConverter:
    """Utility class to convert bytes positions to unicode positions in `bblfsh.Node`."""

    def __init__(self, content: bytes):
        """
        Initialize a new instance of BytesToUnicodeConverter.

        :param content: Code byte representation.
        """
        self._content = content
        # Invalid byte sequences are decoded to U+FFFD replacement characters.
        self._content_str = content.decode(errors="replace")
        self._lines = self._content_str.splitlines(keepends=True)
        self._byte_to_str_offset = self._build_bytes_to_str_offset_mapping(content)
        self._lines_offset = self._build_lines_offset_mapping(self._content_str)

    def convert_content(self) -> str:
        """Convert byte content (or code) to unicode."""
        return self._content_str

    def convert_uast(self, uast: "bblfsh.Node") -> "bblfsh.Node":
        """
        Convert uast Nodes bytes position to unicode position.

        UAST is expected to correspond to provided content.

        :param uast: corresponding UAST.
        :return: a deep copy of the UAST with unicode positions; the input UAST is untouched.
        """
        # Serialize/deserialize is a cheap deep copy so the caller's UAST is not mutated.
        uast = bblfsh.Node.FromString(uast.SerializeToString())
        if not self._content:
            return uast
        for node in self._traverse_uast(uast):
            for position in (node.start_position, node.end_position):
                new_position = self._get_position(self._byte_to_str_offset[position.offset])
                # protobuf message fields cannot be reassigned wholesale, copy field by field.
                for attr in ("offset", "line", "col"):
                    setattr(position, attr, getattr(new_position, attr))
        return uast

    def _get_position(self, offset: int) -> "bblfsh.Position":
        """Get new position for unicode string offset."""
        # First line whose start offset exceeds `offset` minus one is the line we are on.
        line_num = numpy.argmax(self._lines_offset > offset) - 1
        col = offset - self._lines_offset[line_num]
        line = self._lines[line_num]
        if len(line) == col:
            if line.splitlines()[0] != line:
                # The offset points just past this line's newline:
                # normalize to column 0 of the next line.
                line_num += 1
                col = 0
        # bblfsh positions are 1-based.
        return bblfsh.Position(offset=offset, line=line_num + 1, col=col + 1)

    @staticmethod
    def _build_lines_offset_mapping(content: str) -> numpy.ndarray:
        """Return an array of the unicode start offset of every line (plus a final sentinel)."""
        if not content:
            return numpy.empty(shape=(0, 0))
        line_start_offsets = [0]
        for d in content.splitlines(keepends=True):
            line_start_offsets.append(line_start_offsets[-1] + len(d))
        # Sentinel past-the-end entry so `argmax` in `_get_position` always finds a hit.
        line_start_offsets[-1] += 1
        return numpy.array(line_start_offsets)

    @staticmethod
    def _build_bytes_to_str_offset_mapping(content: bytes) -> Dict[int, int]:
        """
        Create a dictionary with bytes offset to unicode string offset mapping.

        :param content: Bytes object which is used to create offsets mapping.
        :return: Dictionary with bytes offset to unicode string offset mapping.
        """
        byte_to_str_offset = {0: 0}
        byte_len_before = 0
        content_str = content.decode(errors="replace")
        for i, char in enumerate(content_str):
            if char != "\ufffd":  # replacement character
                byte_len_before += len(char.encode())
            else:
                # Each replacement character stands for exactly one undecodable byte.
                byte_len_before += 1
            byte_to_str_offset[byte_len_before] = i + 1
        byte_to_str_offset[len(content)] = len(content_str)
        return byte_to_str_offset

    @staticmethod
    def _traverse_uast(uast: "bblfsh.Node"):
        """Traverse UAST in breadth-first order, yielding every node exactly once."""
        # deque.popleft() is O(1); list.pop(0) made the original traversal O(n**2).
        queue = deque((uast,))
        while queue:
            node = queue.popleft()
            queue.extend(node.children)
            yield node

    @staticmethod
    def convert_file(file: "File") -> "UnicodeFile":
        """
        Convert lookout `File` to `UnicodeFile` with converted content and uast.

        path and language fields are the same for result and provided `File` instance.

        :param file: lookout File to convert.
        :return: New UnicodeFile instance.
        """
        converter = BytesToUnicodeConverter(file.content)
        return UnicodeFile(
            content=converter.convert_content(),
            uast=converter.convert_uast(file.uast),
            path=file.path,
            language=file.language,
        )

    @staticmethod
    def convert_change(change: "Change") -> "UnicodeChange":
        """
        Convert lookout `Change` to `UnicodeChange` with converted content and uast.

        :param change: lookout Change to convert.
        :return: New UnicodeChange instance.
        """
        # Bug fix: must build a `UnicodeChange`, not a protobuf `Change` — a protobuf
        # `Change` cannot hold `UnicodeFile` NamedTuples in its `base`/`head` fields,
        # and the declared return type is `UnicodeChange`.
        return UnicodeChange(
            base=BytesToUnicodeConverter.convert_file(change.base),
            head=BytesToUnicodeConverter.convert_file(change.head),
        )

lookout/core/data_requests.py

+83
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from lookout.core.api.service_analyzer_pb2 import Comment
1313
from lookout.core.api.service_data_pb2 import Change, ChangesRequest, File, FilesRequest
1414
from lookout.core.api.service_data_pb2_grpc import DataStub
15+
from lookout.core.bytes_to_unicode_converter import BytesToUnicodeConverter
1516
from lookout.core.garbage_exclusion import GARBAGE_PATTERN
1617
from lookout.core.ports import Type
1718

@@ -88,6 +89,66 @@ def _get_channel(self) -> grpc.Channel:
8889
return channel
8990

9091

92+
class UnicodeDataService(DataService):
    """Retrieves UASTs/files from the Lookout server and converts it to unicode data."""

    @staticmethod
    def _unicodify_changes(get_changes):
        """Wrap `GetChanges` so that every streamed change is converted lazily."""
        @functools.wraps(get_changes)
        def wrapped_get_changes(*args, **kwargs):
            changes = get_changes(*args, **kwargs)
            return map(BytesToUnicodeConverter.convert_change, changes)

        return wrapped_get_changes

    @staticmethod
    def _unicodify_files(get_files):
        """Wrap `GetFiles` so that every streamed file is converted lazily."""
        @functools.wraps(get_files)
        def wrapped_get_files(*args, **kwargs):
            files = get_files(*args, **kwargs)
            return map(BytesToUnicodeConverter.convert_file, files)

        return wrapped_get_files

    @staticmethod
    def _unicodify_uast(parse):
        """Wrap Babelfish `Parse` so that the returned UAST carries unicode offsets."""
        @functools.wraps(parse)
        def wrapped_parse(parse_request: bblfsh.aliases.ParseRequest):
            response = parse(parse_request)
            converter = BytesToUnicodeConverter(parse_request.content.encode())
            return bblfsh.aliases.ParseResponse(
                uast=converter.convert_uast(response.uast),
                errors=response.errors)

        return wrapped_parse

    def get_data(self) -> DataStub:
        """
        Return a `DataStub` for the current thread.
        """
        stub = super().get_data()
        stub.GetChanges = UnicodeDataService._unicodify_changes(stub.GetChanges)
        stub.GetFiles = UnicodeDataService._unicodify_files(stub.GetFiles)
        return stub

    def get_bblfsh(self) -> bblfsh.aliases.ProtocolServiceStub:
        """
        Return a Babelfish `ProtocolServiceStub` for the current thread.
        """
        stub = super().get_bblfsh()
        stub.Parse = UnicodeDataService._unicodify_uast(stub.Parse)
        return stub

    def __str__(self):
        """Summarize the UnicodeDataService instance as a string."""
        return "UnicodeDataService(%s)" % self._data_request_address

    @staticmethod
    def from_data_service(data_service: DataService) -> "UnicodeDataService":
        """Convert DataService to UnicodeDataService."""
        return UnicodeDataService(data_service._data_request_address)
150+
151+
91152
def _handle_rpc_errors(func):
92153
@functools.wraps(func)
93154
def wrapped_handle_rpc_errors(cls: Type[Analyzer], ptr: ReferencePointer, config: dict,
@@ -230,6 +291,28 @@ def wrapped_with_uasts_and_contents(cls: Type[Analyzer], ptr: ReferencePointer,
230291
return wrapped_with_uasts_and_contents
231292

232293

294+
def with_unicode_data_service(func):  # noqa: D401
    """
    Decorator to convert DataService to UnicodeDataService.

    The first `DataService` found among positional or keyword arguments is replaced
    by a `UnicodeDataService`, which is shut down after `func` returns. If no
    `DataService` argument is present, `func` is called with the arguments unchanged.
    """
    @functools.wraps(func)
    def wrapped_with_unicode_data_service(*args, **kwargs) -> "AnalyzerModel":
        # Bug fix: the original left `data_service` unbound when no DataService
        # argument was found, raising NameError in the `finally` block.
        data_service = None
        try:
            args = list(args)
            for i, arg in enumerate(args):
                if isinstance(arg, DataService):
                    data_service = UnicodeDataService.from_data_service(arg)
                    args[i] = data_service
                    return func(*args, **kwargs)
            for arg_name in kwargs:
                if isinstance(kwargs[arg_name], DataService):
                    data_service = UnicodeDataService.from_data_service(kwargs[arg_name])
                    kwargs[arg_name] = data_service
                    return func(*args, **kwargs)
            # Bug fix: no DataService argument — call through unchanged instead of
            # falling off the end (implicit None) and then crashing in `finally`.
            return func(*args, **kwargs)
        finally:
            if data_service is not None:
                data_service.shutdown()

    return wrapped_with_unicode_data_service
314+
315+
233316
def request_changes(stub: DataStub, ptr_from: ReferencePointer, ptr_to: ReferencePointer,
234317
contents: bool, uast: bool) -> Iterator[Change]:
235318
"""
456 Bytes
Binary file not shown.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
import lzma
2+
import os
3+
import unittest
4+
5+
import bblfsh
6+
from lookout.sdk.service_data_pb2 import File
7+
import numpy
8+
9+
from lookout.core.bytes_to_unicode_converter import BytesToUnicodeConverter
10+
11+
12+
def check_uast_transformation(test_case: unittest.TestCase, content: bytes,
                              uast_byte_positions, uast_unicode_positions):
    """
    Check that every converted node spans the same decoded text as its byte-offset twin.

    Both UASTs are traversed in lockstep; for each node whose positions were actually
    changed by the conversion, the unicode offset span must equal the length of the
    decoded byte span.
    """
    byte_nodes = BytesToUnicodeConverter._traverse_uast(uast_byte_positions)
    unicode_nodes = BytesToUnicodeConverter._traverse_uast(uast_unicode_positions)
    for node_byte, node_uni in zip(byte_nodes, unicode_nodes):
        unchanged = (node_byte.start_position == node_uni.start_position and
                     node_byte.end_position == node_uni.end_position)
        if unchanged:
            continue
        byte_span = content[node_byte.start_position.offset:node_byte.end_position.offset]
        unicode_span_length = node_uni.end_position.offset - node_uni.start_position.offset
        test_case.assertEqual(len(byte_span.decode(errors="replace")), unicode_span_length)
22+
23+
24+
class BytesToUnicodeConverterTests(unittest.TestCase):
    """
    Tests for `BytesToUnicodeConverter`.

    NOTE(review): these are integration tests — `setUpClass` connects to a
    Babelfish server expected at localhost:9432 (started via the Makefile).
    """

    @classmethod
    def setUpClass(cls):
        # One shared parse callable for all tests; requires a running bblfshd.
        cls.parse = bblfsh.BblfshClient("localhost:9432").parse

    def test_build_bytes_to_str_offset_mapping(self):
        """Mapping must stay consistent on a mix of valid UTF-8 and broken byte sequences."""
        content = "I don't take an apéritif après-ski".encode() + \
            b"\x80\x80\xb3\x09\xc3\xa8\x80\x80\xc3\x80"
        content_str = content.decode(errors="replace")
        byte_to_str_offset = BytesToUnicodeConverter._build_bytes_to_str_offset_mapping(content)
        # Every recorded byte offset must split `content` at the same point as
        # its unicode counterpart splits the decoded string.
        for offset_byte, offset_str in byte_to_str_offset.items():
            self.assertEqual(content[:offset_byte].decode(errors="replace"),
                             content_str[:offset_str])
            self.assertEqual(content[offset_byte:].decode(errors="replace"),
                             content_str[offset_str:])

    def test_byte_eq_str(self):
        """Pure-ASCII code: conversion must be the identity on the UAST."""
        code = b"var a = 1;\nvar b = 'abc'"
        response = self.parse(contents=code, language="javascript", filename="test.js")
        uast = response.uast

        converter = BytesToUnicodeConverter(code)
        uast2 = converter.convert_uast(uast)
        self.assertEqual(uast, uast2)

    def test_byte_not_eq_str(self):
        """Multi-byte characters: converted offsets must measure unicode lengths."""
        code = b"var a = 1;\nvar b = '\xc3\x80'"
        response = self.parse(contents=code, language="javascript", filename="test.js")
        uast = response.uast

        converter = BytesToUnicodeConverter(code)
        uast_uni = converter.convert_uast(uast)
        check_uast_transformation(self, code, uast, uast_uni)

    def test_build_lines_offset_mapping(self):
        """Line-start offsets, including the trailing sentinel, must match by hand."""
        content = "1\n23\n\n456\r\n\t\t\t\n\n"
        res = BytesToUnicodeConverter._build_lines_offset_mapping(content)
        self.assertTrue((res == numpy.array([0, 2, 5, 6, 11, 15, 17])).all())

    def test_convert_file(self):
        """`convert_file` must decode content, keep path/language, and convert the UAST."""
        code = b"var a = 1;\nvar b = '\xc3\x80'"
        response = self.parse(contents=code, language="javascript", filename="test.js")
        uast = response.uast

        file = File(content=code, path="test.js", language="javascript", uast=uast)
        unicode_file = BytesToUnicodeConverter.convert_file(file)
        self.assertEqual(unicode_file.content, code.decode())
        self.assertEqual(unicode_file.path, file.path)
        self.assertEqual(unicode_file.language, file.language)
        check_uast_transformation(self, code, uast, unicode_file.uast)

    def test_real_file(self):
        """End-to-end check on a real (xz-compressed) JavaScript fixture."""
        filepath = os.path.join(os.path.split(__file__)[0], "test-markdown-options.js.xz")
        with lzma.open(filepath) as f:
            content = f.read()
        uast = self.parse(contents=content, filename=filepath, language="javascript").uast
        uast_uni = BytesToUnicodeConverter(content).convert_uast(uast)
        check_uast_transformation(self, content, uast, uast_uni)
82+
83+
84+
# Allow running this test module directly, outside of a pytest/unittest runner.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)