Add project for calculating character width

Uses 'EastAsianWidth.txt' downloaded from Unicode to generate a lookup table.
dextercd · May 10, 2021 · 004aa9a · 004aa9a
1 parent 7543bbc
commit 004aa9a
Show file tree

Hide file tree

Showing 7 changed files with 216 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,3 @@
 build/
 build-*/
+__pycache__/
diff --git a/character_width/CMakeLists.txt b/character_width/CMakeLists.txt
@@ -0,0 +1,27 @@
+# Fetch the Unicode East Asian Width data once, at configure time.  The file
+# is kept in the binary directory so later configures reuse it.
+if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/EastAsianWidth.txt)
+    file(DOWNLOAD
+        https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
+        ${CMAKE_CURRENT_BINARY_DIR}/EastAsianWidth.txt
+        STATUS eaw_download_status)
+
+    # STATUS is a two-element list: a numeric code (0 on success) and a
+    # human-readable message.
+    list(GET eaw_download_status 0 eaw_download_code)
+    if(NOT eaw_download_code EQUAL 0)
+        # Remove whatever the failed transfer left behind; otherwise the
+        # EXISTS guard above would accept the broken file on the next run.
+        file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/EastAsianWidth.txt)
+        message(FATAL_ERROR
+            "Could not download EastAsianWidth.txt: ${eaw_download_status}")
+    endif()
+endif()
+
+# Turn the downloaded data into a C++ initializer list of double-width ranges.
+# The Python modules are listed as dependencies so that editing the generator
+# regenerates the table.
+add_custom_command(
+    OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/double_width_table.inc
+    COMMAND
+        python3 ${CMAKE_CURRENT_SOURCE_DIR}/generate_width_table.py
+                --input ${CMAKE_CURRENT_BINARY_DIR}/EastAsianWidth.txt
+                --output ${CMAKE_CURRENT_BINARY_DIR}/double_width_table.inc
+
+    DEPENDS
+        ${CMAKE_CURRENT_BINARY_DIR}/EastAsianWidth.txt
+        generate_width_table.py
+        parse_eaw.py
+        range_list.py
+
+    # Pass the arguments through unmodified (e.g. paths containing spaces).
+    VERBATIM
+)
+
+# Listing the generated file as a source ties the custom command to this
+# target, so the table is produced before character_width.cpp is compiled.
+add_library(character_width
+    ${CMAKE_CURRENT_BINARY_DIR}/double_width_table.inc
+    src/character_width.cpp)
+
+# The binary directory is private: only character_width.cpp includes the
+# generated table.  Consumers see just the public headers.
+target_include_directories(character_width
+    PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
+    PUBLIC include)
diff --git a/character_width/generate_width_table.py b/character_width/generate_width_table.py
@@ -0,0 +1,40 @@
+import sys
+import argparse
+
+import parse_eaw
+import range_list
+
+
+def eaw_to_width(eaw):
+    if eaw == 'W': return 2
+    if eaw == 'F': return 2
+    if eaw == 'Na': return 1
+    if eaw == 'H': return 1
+    if eaw == 'N': return 1
+    if eaw == 'A': return 1
+    raise Exception(f'Unhandled Value {eaw}')
+
+
+parser = argparse.ArgumentParser(description='Generate double width  table')
+parser.add_argument('--input', required=True)
+parser.add_argument('--output', required=True)
+
+args = parser.parse_args()
+
+east_asian_width = parse_eaw.parse(args.input)
+width_table = range_list.transform_list(east_asian_width, eaw_to_width)
+
+with open(args.output, 'w') as output:
+    print(f'// This file was generated by {__file__}');
+    count = 0
+    for w in list(width_table):
+        if w.property == 2:
+            print(f'{{ {w.range_start:#7x}, {w.range_end:#7x} }},', end='', file=output)
+
+            count += 1
+            if count % 3 == 0:
+                print(end='\n', file=output)
+            else:
+                print(' ', end='', file=output)
+
+    print(end='\n', file=output)
diff --git a/character_width/include/cw/character_width.hpp b/character_width/include/cw/character_width.hpp
@@ -0,0 +1,10 @@
+#ifndef CW_CHARACTER_WIDTH_HPP
+#define CW_CHARACTER_WIDTH_HPP
+
+namespace cw {
+
+/// Display width of a single code point.
+///
+/// Returns 2 when `code_point` falls inside one of the double-width ranges
+/// compiled into the library, and 1 for every other value.
+int character_width(char32_t code_point);
+
+} // cw::
+
+#endif // header guard
diff --git a/character_width/parse_eaw.py b/character_width/parse_eaw.py
@@ -0,0 +1,35 @@
+import range_list
+
+
+def parse_codepoints(codepoints):
+    split = codepoints.split('..')
+    if len(split) == 2:
+        return int(split[0], base=16), int(split[1], base=16)
+
+    return int(codepoints, base=16), int(codepoints, base=16)
+
+
+def parse(file_name):
+    ranges = range_list.RangeList()
+
+    with open(file_name) as data_file:
+        for line in data_file:
+            split_result = line.split('#')
+
+            if len(split_result) == 0:
+                continue # Empty line
+
+            data = split_result[0].strip()
+            if not data:
+                continue # Only comment on this line
+
+            codepoints, property = data.split(';')
+            codepoints = codepoints.strip()
+            property = property.strip()
+
+            range_start, range_end = parse_codepoints(codepoints)
+
+            parsed_range = range_list.RangeProperty(range_start, range_end, property)
+            ranges.add(parsed_range)
+
+    return ranges
diff --git a/character_width/range_list.py b/character_width/range_list.py
@@ -0,0 +1,65 @@
+from dataclasses import dataclass, replace
+
+
+@dataclass(frozen=True)
+class RangeProperty:
+    range_start: int
+    range_end: int
+    property: any
+
+    def __post_init__(self):
+        if self.range_start > self.range_end:
+            raise Exception("Invalid range.")
+
+    def can_be_merged(self, other):
+        if self.property != other.property:
+            return False
+
+        return not(
+            self.range_end + 1 < other.range_start or
+            other.range_end + 1 < self.range_start)
+
+    def merge(self, other):
+        """Creates a range that encompasses both self and other
+
+        Doesn't check whether the ranges can be merged.
+        You can check that via can_be_merged.
+        """
+        return RangeProperty(
+                    range_start=min(self.range_start, other.range_start),
+                    range_end=max(self.range_end, other.range_end),
+                    property=self.property)
+
+
+class RangeList:
+    def __init__(self):
+        self._completed_ranges = []
+        self._in_progress_range = None
+
+    def add(self, new_range):
+        if self._in_progress_range is None:
+            self._in_progress_range = new_range
+            return
+
+        if new_range.range_start <= self._in_progress_range.range_end:
+            raise Exception('Items must be added in order.')
+
+        if self._in_progress_range.can_be_merged(new_range):
+            self._in_progress_range = self._in_progress_range.merge(new_range)
+            return
+
+        self._completed_ranges.append(self._in_progress_range)
+        self._in_progress_range = new_range
+
+    def __iter__(self):
+        yield from self._completed_ranges
+        yield self._in_progress_range
+
+
+def transform_list(range_list, property_mapping):
+    new_list = RangeList()
+    for range in range_list:
+        new_range = replace(range, property=property_mapping(range.property))
+        new_list.add(new_range)
+
+    return new_list
diff --git a/character_width/src/character_width.cpp b/character_width/src/character_width.cpp
@@ -0,0 +1,38 @@
+#include <iterator>
+#include <algorithm>
+
+namespace cw {
+
+namespace {
+
+// A range of code points.  Both ends are inclusive: the lookup in
+// character_width() treats `end` itself as part of the range.
+struct code_point_range {
+    char32_t begin;
+    char32_t end;
+};
+
+// Table of double-width ranges.  The initializers come from
+// double_width_table.inc, which the build generates with
+// generate_width_table.py.  character_width() searches this table with
+// std::lower_bound, so the entries must be sorted in ascending order.
+code_point_range const double_width_ranges[] {
+
+#include <double_width_table.inc>
+
+};
+
+} // anonymous namespace
+
+// Returns 2 when `code` lies inside one of the double-width ranges and 1
+// otherwise.
+int character_width(char32_t code)
+{
+    auto const first = std::begin(double_width_ranges);
+    auto const last = std::end(double_width_ranges);
+
+    // Binary search for the first range whose (inclusive) end is not below
+    // `code`; that is the only range that could contain it.
+    auto const candidate = std::lower_bound(
+        first,
+        last,
+        code,
+        [](code_point_range const& range, char32_t value) { return range.end < value; });
+
+    // `code` may still sit in the gap before the candidate range starts.
+    bool const is_double_width = candidate != last && candidate->begin <= code;
+
+    return is_double_width ? 2 : 1;
+}
+
+} // cw::