Skip to content

Commit b212f27

Browse files
committed
Add bytes to unicode converters
Signed-off-by: Konstantin Slavnov <[email protected]>
1 parent 3b0fb81 commit b212f27

9 files changed

+363
-5
lines changed

Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -30,4 +30,4 @@ bblfsh-start:
3030
! docker ps | grep bblfshd # bblfsh server should not be running already
3131
docker run -d --name style_analyzer_bblfshd --privileged -p 9432\:9432 bblfsh/bblfshd\:v2.10.0
3232
docker exec style_analyzer_bblfshd bblfshctl driver install \
33-
javascript docker://bblfsh/javascript-driver\:v1.2.0
33+
javascript docker://bblfsh/javascript-driver\:v2.6.0

lookout/core/analyzer.py

+19
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,25 @@ def to_pb(self) -> ApiReferencePointer:
3232
hash=self.commit)
3333

3434

35+
UnicodeFile = NamedTuple("UnicodeFile", (
    ("content", str), ("uast", "bblfsh.Node"), ("path", str), ("language", str)))
UnicodeFile.__doc__ = """
UnicodeFile is an alternative to `lookout.core.api.service_data_pb2.File`.

It is used in case `@with_unicode_files` or `@with_unicode_changes` decorators are applied.
The main difference is that `content` field is a unicode string and uast offsets are changed to
corresponding unicode string offsets.
""".strip()


UnicodeChange = NamedTuple("UnicodeChange", (("base", UnicodeFile), ("head", UnicodeFile)))
UnicodeChange.__doc__ = """
UnicodeChange is an alternative to `lookout.core.api.service_data_pb2.Change`.

It is used in case `@with_unicode_changes` decorator is applied.
The main difference is that `base` and `head` fields have type `UnicodeFile`. For more info read
documentation about `UnicodeFile`.
""".strip()
52+
53+
3554
class AnalyzerModel(Model):
3655
"""
3756
All models used in `Analyzer`-s must derive from this base class.
+126
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
from collections import deque
from typing import Dict

import bblfsh
from lookout.sdk.service_data_pb2 import Change, File
import numpy

from lookout.core.analyzer import UnicodeChange, UnicodeFile
8+
9+
10+
class BytesToUnicodeConverter:
    """Utility class to convert bytes positions to unicode positions in `bblfsh.Node`."""

    def __init__(self, content: bytes):
        """
        Initialize a new instance of BytesToUnicodeConverter.

        :param content: Code byte representation.
        """
        self._content = content
        # Invalid byte sequences are decoded to U+FFFD replacement characters.
        self._content_str = content.decode(errors="replace")
        self._lines = self._content_str.splitlines(keepends=True)
        self._byte_to_str_offset = self._build_bytes_to_str_offset_mapping(content)
        self._lines_offset = self._build_lines_offset_mapping(self._content_str)

    def convert_content(self) -> str:
        """Convert byte content (or code) to unicode."""
        return self._content_str

    def convert_uast(self, uast: "bblfsh.Node") -> "bblfsh.Node":
        """
        Convert uast Nodes bytes position to unicode position.

        UAST is expected to correspond to provided content.

        :param uast: corresponding UAST.
        :return: a deep copy of the UAST with unicode positions; the input UAST is untouched.
        """
        # Serialize/deserialize is a cheap deep copy so the caller's UAST is not mutated.
        uast = bblfsh.Node.FromString(uast.SerializeToString())
        if not self._content:
            return uast
        for node in self._traverse_uast(uast):
            for position in (node.start_position, node.end_position):
                new_position = self._get_position(self._byte_to_str_offset[position.offset])
                # protobuf message fields cannot be reassigned wholesale, copy field by field.
                for attr in ("offset", "line", "col"):
                    setattr(position, attr, getattr(new_position, attr))
        return uast

    def _get_position(self, offset: int) -> "bblfsh.Position":
        """Get new position for unicode string offset."""
        # First line whose start offset exceeds `offset` minus one is the line we are on.
        line_num = numpy.argmax(self._lines_offset > offset) - 1
        col = offset - self._lines_offset[line_num]
        line = self._lines[line_num]
        if len(line) == col:
            if line.splitlines()[0] != line:
                # The offset points just past this line's newline:
                # normalize to column 0 of the next line.
                line_num += 1
                col = 0
        # bblfsh positions are 1-based.
        return bblfsh.Position(offset=offset, line=line_num + 1, col=col + 1)

    @staticmethod
    def _build_lines_offset_mapping(content: str) -> numpy.ndarray:
        """Return an array of the unicode start offset of every line (plus a final sentinel)."""
        if not content:
            return numpy.empty(shape=(0, 0))
        line_start_offsets = [0]
        for d in content.splitlines(keepends=True):
            line_start_offsets.append(line_start_offsets[-1] + len(d))
        # Sentinel past-the-end entry so `argmax` in `_get_position` always finds a hit.
        line_start_offsets[-1] += 1
        return numpy.array(line_start_offsets)

    @staticmethod
    def _build_bytes_to_str_offset_mapping(content: bytes) -> Dict[int, int]:
        """
        Create a dictionary with bytes offset to unicode string offset mapping.

        :param content: Bytes object which is used to create offsets mapping.
        :return: Dictionary with bytes offset to unicode string offset mapping.
        """
        byte_to_str_offset = {0: 0}
        byte_len_before = 0
        content_str = content.decode(errors="replace")
        for i, char in enumerate(content_str):
            if char != "\ufffd":  # replacement character
                byte_len_before += len(char.encode())
            else:
                # Each replacement character stands for exactly one undecodable byte.
                byte_len_before += 1
            byte_to_str_offset[byte_len_before] = i + 1
        byte_to_str_offset[len(content)] = len(content_str)
        return byte_to_str_offset

    @staticmethod
    def _traverse_uast(uast: "bblfsh.Node"):
        """Traverse UAST in breadth-first order, yielding every node exactly once."""
        # deque.popleft() is O(1); list.pop(0) made the original traversal O(n**2).
        queue = deque((uast,))
        while queue:
            node = queue.popleft()
            queue.extend(node.children)
            yield node

    @staticmethod
    def convert_file(file: "File") -> "UnicodeFile":
        """
        Convert lookout `File` to `UnicodeFile` with converted content and uast.

        path and language fields are the same for result and provided `File` instance.

        :param file: lookout File to convert.
        :return: New UnicodeFile instance.
        """
        converter = BytesToUnicodeConverter(file.content)
        return UnicodeFile(
            content=converter.convert_content(),
            uast=converter.convert_uast(file.uast),
            path=file.path,
            language=file.language,
        )

    @staticmethod
    def convert_change(change: "Change") -> "UnicodeChange":
        """
        Convert lookout `Change` to `UnicodeChange` with converted content and uast.

        :param change: lookout Change to convert.
        :return: New UnicodeChange instance.
        """
        # Bug fix: must build a `UnicodeChange`, not a protobuf `Change` — a protobuf
        # `Change` cannot hold `UnicodeFile` NamedTuples in its `base`/`head` fields,
        # and the declared return type is `UnicodeChange`.
        return UnicodeChange(
            base=BytesToUnicodeConverter.convert_file(change.base),
            head=BytesToUnicodeConverter.convert_file(change.head),
        )

lookout/core/data_requests.py

+83
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from lookout.core.api.service_analyzer_pb2 import Comment
1313
from lookout.core.api.service_data_pb2 import Change, ChangesRequest, File, FilesRequest
1414
from lookout.core.api.service_data_pb2_grpc import DataStub
15+
from lookout.core.bytes_to_unicode_converter import BytesToUnicodeConverter
1516
from lookout.core.garbage_exclusion import GARBAGE_PATTERN
1617
from lookout.core.ports import Type
1718

@@ -88,6 +89,66 @@ def _get_channel(self) -> grpc.Channel:
8889
return channel
8990

9091

92+
class UnicodeDataService(DataService):
    """Retrieves UASTs/files from the Lookout server and converts it to unicode data."""

    @staticmethod
    def _unicodify_changes(get_changes):
        """Wrap `GetChanges` so that every streamed change is converted lazily."""
        @functools.wraps(get_changes)
        def wrapped_get_changes(*args, **kwargs):
            changes = get_changes(*args, **kwargs)
            return map(BytesToUnicodeConverter.convert_change, changes)

        return wrapped_get_changes

    @staticmethod
    def _unicodify_files(get_files):
        """Wrap `GetFiles` so that every streamed file is converted lazily."""
        @functools.wraps(get_files)
        def wrapped_get_files(*args, **kwargs):
            files = get_files(*args, **kwargs)
            return map(BytesToUnicodeConverter.convert_file, files)

        return wrapped_get_files

    @staticmethod
    def _unicodify_uast(parse):
        """Wrap Babelfish `Parse` so that the returned UAST carries unicode offsets."""
        @functools.wraps(parse)
        def wrapped_parse(parse_request: bblfsh.aliases.ParseRequest):
            response = parse(parse_request)
            converter = BytesToUnicodeConverter(parse_request.content.encode())
            return bblfsh.aliases.ParseResponse(
                uast=converter.convert_uast(response.uast),
                errors=response.errors)

        return wrapped_parse

    def get_data(self) -> DataStub:
        """
        Return a `DataStub` for the current thread.
        """
        stub = super().get_data()
        stub.GetChanges = UnicodeDataService._unicodify_changes(stub.GetChanges)
        stub.GetFiles = UnicodeDataService._unicodify_files(stub.GetFiles)
        return stub

    def get_bblfsh(self) -> bblfsh.aliases.ProtocolServiceStub:
        """
        Return a Babelfish `ProtocolServiceStub` for the current thread.
        """
        stub = super().get_bblfsh()
        stub.Parse = UnicodeDataService._unicodify_uast(stub.Parse)
        return stub

    def __str__(self):
        """Summarize the UnicodeDataService instance as a string."""
        return "UnicodeDataService(%s)" % self._data_request_address

    @staticmethod
    def from_data_service(data_service: DataService) -> "UnicodeDataService":
        """Convert DataService to UnicodeDataService."""
        return UnicodeDataService(data_service._data_request_address)
150+
151+
91152
def _handle_rpc_errors(func):
92153
@functools.wraps(func)
93154
def wrapped_handle_rpc_errors(cls: Type[Analyzer], ptr: ReferencePointer, config: dict,
@@ -230,6 +291,28 @@ def wrapped_with_uasts_and_contents(cls: Type[Analyzer], ptr: ReferencePointer,
230291
return wrapped_with_uasts_and_contents
231292

232293

294+
def with_unicode_data_service(func):  # noqa: D401
    """
    Decorator to convert DataService to UnicodeDataService.

    The first `DataService` found among positional or keyword arguments is replaced
    by a `UnicodeDataService`, which is shut down after `func` returns. If no
    `DataService` argument is present, `func` is called with the arguments unchanged.
    """
    @functools.wraps(func)
    def wrapped_with_unicode_data_service(*args, **kwargs) -> "AnalyzerModel":
        # Bug fix: the original left `data_service` unbound when no DataService
        # argument was found, raising NameError in the `finally` block.
        data_service = None
        try:
            args = list(args)
            for i, arg in enumerate(args):
                if isinstance(arg, DataService):
                    data_service = UnicodeDataService.from_data_service(arg)
                    args[i] = data_service
                    return func(*args, **kwargs)
            for arg_name in kwargs:
                if isinstance(kwargs[arg_name], DataService):
                    data_service = UnicodeDataService.from_data_service(kwargs[arg_name])
                    kwargs[arg_name] = data_service
                    return func(*args, **kwargs)
            # Bug fix: no DataService argument — call through unchanged instead of
            # falling off the end (implicit None) and then crashing in `finally`.
            return func(*args, **kwargs)
        finally:
            if data_service is not None:
                data_service.shutdown()

    return wrapped_with_unicode_data_service
314+
315+
233316
def request_changes(stub: DataStub, ptr_from: ReferencePointer, ptr_to: ReferencePointer,
234317
contents: bool, uast: bool) -> Iterator[Change]:
235318
"""
456 Bytes
Binary file not shown.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
import lzma
2+
import os
3+
import unittest
4+
5+
import bblfsh
6+
from lookout.sdk.service_data_pb2 import File
7+
import numpy
8+
9+
from lookout.core.bytes_to_unicode_converter import BytesToUnicodeConverter
10+
11+
12+
def check_uast_transformation(test_case: unittest.TestCase, content: bytes,
                              uast_byte_positions, uast_unicode_positions):
    """
    Check that every converted node spans the same decoded text as its byte-offset twin.

    Both UASTs are traversed in lockstep; for each node whose positions were actually
    changed by the conversion, the unicode offset span must equal the length of the
    decoded byte span.
    """
    byte_nodes = BytesToUnicodeConverter._traverse_uast(uast_byte_positions)
    unicode_nodes = BytesToUnicodeConverter._traverse_uast(uast_unicode_positions)
    for node_byte, node_uni in zip(byte_nodes, unicode_nodes):
        unchanged = (node_byte.start_position == node_uni.start_position and
                     node_byte.end_position == node_uni.end_position)
        if unchanged:
            continue
        byte_span = content[node_byte.start_position.offset:node_byte.end_position.offset]
        unicode_span_length = node_uni.end_position.offset - node_uni.start_position.offset
        test_case.assertEqual(len(byte_span.decode(errors="replace")), unicode_span_length)
22+
23+
24+
class BytesToUnicodeConverterTests(unittest.TestCase):
    """
    Tests for `BytesToUnicodeConverter`.

    NOTE(review): these are integration tests — `setUpClass` connects to a
    Babelfish server expected at localhost:9432 (started via the Makefile).
    """

    @classmethod
    def setUpClass(cls):
        # One shared parse callable for all tests; requires a running bblfshd.
        cls.parse = bblfsh.BblfshClient("localhost:9432").parse

    def test_build_bytes_to_str_offset_mapping(self):
        """Mapping must stay consistent on a mix of valid UTF-8 and broken byte sequences."""
        content = "I don't take an apéritif après-ski".encode() + \
            b"\x80\x80\xb3\x09\xc3\xa8\x80\x80\xc3\x80"
        content_str = content.decode(errors="replace")
        byte_to_str_offset = BytesToUnicodeConverter._build_bytes_to_str_offset_mapping(content)
        # Every recorded byte offset must split `content` at the same point as
        # its unicode counterpart splits the decoded string.
        for offset_byte, offset_str in byte_to_str_offset.items():
            self.assertEqual(content[:offset_byte].decode(errors="replace"),
                             content_str[:offset_str])
            self.assertEqual(content[offset_byte:].decode(errors="replace"),
                             content_str[offset_str:])

    def test_byte_eq_str(self):
        """Pure-ASCII code: conversion must be the identity on the UAST."""
        code = b"var a = 1;\nvar b = 'abc'"
        response = self.parse(contents=code, language="javascript", filename="test.js")
        uast = response.uast

        converter = BytesToUnicodeConverter(code)
        uast2 = converter.convert_uast(uast)
        self.assertEqual(uast, uast2)

    def test_byte_not_eq_str(self):
        """Multi-byte characters: converted offsets must measure unicode lengths."""
        code = b"var a = 1;\nvar b = '\xc3\x80'"
        response = self.parse(contents=code, language="javascript", filename="test.js")
        uast = response.uast

        converter = BytesToUnicodeConverter(code)
        uast_uni = converter.convert_uast(uast)
        check_uast_transformation(self, code, uast, uast_uni)

    def test_build_lines_offset_mapping(self):
        """Line-start offsets, including the trailing sentinel, must match by hand."""
        content = "1\n23\n\n456\r\n\t\t\t\n\n"
        res = BytesToUnicodeConverter._build_lines_offset_mapping(content)
        self.assertTrue((res == numpy.array([0, 2, 5, 6, 11, 15, 17])).all())

    def test_convert_file(self):
        """`convert_file` must decode content, keep path/language, and convert the UAST."""
        code = b"var a = 1;\nvar b = '\xc3\x80'"
        response = self.parse(contents=code, language="javascript", filename="test.js")
        uast = response.uast

        file = File(content=code, path="test.js", language="javascript", uast=uast)
        unicode_file = BytesToUnicodeConverter.convert_file(file)
        self.assertEqual(unicode_file.content, code.decode())
        self.assertEqual(unicode_file.path, file.path)
        self.assertEqual(unicode_file.language, file.language)
        check_uast_transformation(self, code, uast, unicode_file.uast)

    def test_real_file(self):
        """End-to-end check on a real (xz-compressed) JavaScript fixture."""
        filepath = os.path.join(os.path.split(__file__)[0], "test-markdown-options.js.xz")
        with lzma.open(filepath) as f:
            content = f.read()
        uast = self.parse(contents=content, filename=filepath, language="javascript").uast
        uast_uni = BytesToUnicodeConverter(content).convert_uast(uast)
        check_uast_transformation(self, content, uast, uast_uni)
82+
83+
84+
# Allow running this test module directly, outside of a pytest/unittest runner.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)