Skip to content

Commit

Permalink
Add project for calculating character width
Browse files Browse the repository at this point in the history
Uses 'EastAsianWidth.txt' downloaded from Unicode to generate a lookup table.
  • Loading branch information
dextercd committed May 10, 2021
1 parent 7543bbc commit 004aa9a
Show file tree
Hide file tree
Showing 7 changed files with 216 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
build/
build-*/
__pycache__/
27 changes: 27 additions & 0 deletions character_width/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Fetch the Unicode East Asian Width data once per build tree.
# The download status is checked explicitly: without STATUS a failed transfer
# can leave an empty/partial file behind, and the EXISTS guard would then
# never retry the download on later configure runs.
if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/EastAsianWidth.txt)
  file(DOWNLOAD
    https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
    ${CMAKE_CURRENT_BINARY_DIR}/EastAsianWidth.txt
    STATUS eaw_download_status)

  # STATUS is a two-element list: numeric code (0 == success) and message.
  list(GET eaw_download_status 0 eaw_download_code)
  if(NOT eaw_download_code EQUAL 0)
    list(GET eaw_download_status 1 eaw_download_message)
    # Remove whatever was written so the next configure attempts the download again.
    file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/EastAsianWidth.txt)
    message(FATAL_ERROR
      "Failed to download EastAsianWidth.txt: ${eaw_download_message}")
  endif()
endif()

# Regenerate the C++ lookup table whenever the Unicode data file or any of the
# generator's Python modules change.
add_custom_command(
  OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/double_width_table.inc
  COMMAND
    python3 ${CMAKE_CURRENT_SOURCE_DIR}/generate_width_table.py
      --input ${CMAKE_CURRENT_BINARY_DIR}/EastAsianWidth.txt
      --output ${CMAKE_CURRENT_BINARY_DIR}/double_width_table.inc

  # Relative DEPENDS entries name files next to this CMakeLists.txt.
  DEPENDS
    ${CMAKE_CURRENT_BINARY_DIR}/EastAsianWidth.txt
    generate_width_table.py
    parse_eaw.py
    range_list.py

  COMMENT "Generating double_width_table.inc from EastAsianWidth.txt"
  # VERBATIM makes argument escaping identical on every platform, so source or
  # build directories containing spaces are passed to the script intact.
  VERBATIM
)

# Listing the generated table as a source ties the custom command that
# produces it to this target, so the table exists before the library compiles.
add_library(character_width
${CMAKE_CURRENT_BINARY_DIR}/double_width_table.inc
src/character_width.cpp)

# The binary dir is PRIVATE: only src/character_width.cpp includes the
# generated double_width_table.inc. The include/ directory is PUBLIC so that
# consumers can reach the cw/character_width.hpp header.
target_include_directories(character_width
PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
PUBLIC include)
40 changes: 40 additions & 0 deletions character_width/generate_width_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import sys
import argparse

import parse_eaw
import range_list


def eaw_to_width(eaw):
    """Translate an East Asian Width property code into a width.

    'W' and 'F' map to 2; 'Na', 'H', 'N' and 'A' map to 1.
    Any other code raises an Exception naming the offending value.
    """
    if eaw in ('W', 'F'):
        return 2

    if eaw in ('Na', 'H', 'N', 'A'):
        return 1

    raise Exception(f'Unhandled Value {eaw}')


# Command line: both the Unicode data file to read and the table to write
# are mandatory.
parser = argparse.ArgumentParser(description='Generate double width table')
parser.add_argument('--input', required=True)
parser.add_argument('--output', required=True)

args = parser.parse_args()

# Parse the property ranges, then collapse them to width ranges (1 or 2);
# transform_list re-merges neighbours that end up with the same width.
east_asian_width = parse_eaw.parse(args.input)
width_table = range_list.transform_list(east_asian_width, eaw_to_width)

with open(args.output, 'w') as output:
    # The provenance comment belongs in the generated file, not on stdout.
    print(f'// This file was generated by {__file__}', file=output)

    count = 0
    for w in width_table:
        # Only double-width ranges are emitted; anything absent from the
        # table is treated as single width by the C++ lookup.
        if w.property == 2:
            print(f'{{ {w.range_start:#7x}, {w.range_end:#7x} }},', end='', file=output)

            # Lay the initializers out three per line.
            count += 1
            if count % 3 == 0:
                print(end='\n', file=output)
            else:
                print(' ', end='', file=output)

    # Terminate the last (possibly partial) row.
    print(end='\n', file=output)
10 changes: 10 additions & 0 deletions character_width/include/cw/character_width.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#ifndef CW_CHARACTER_WIDTH_HPP
#define CW_CHARACTER_WIDTH_HPP

namespace cw {

/// Returns the width of a single code point.
///
/// The result is 2 when the code point lies inside one of the double-width
/// ranges of the generated table (built from East Asian Width values 'W'
/// and 'F'), and 1 for every other value.
int character_width(char32_t);

} // cw::

#endif // header guard
35 changes: 35 additions & 0 deletions character_width/parse_eaw.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import range_list


def parse_codepoints(codepoints):
    """Parse a hexadecimal code point field into a (start, end) pair.

    The field is either a single value such as '00A1' or a range written as
    'START..END'. A single value yields a pair with both members equal.
    """
    parts = codepoints.split('..')
    if len(parts) != 2:
        # Not a two-sided range: treat the whole field as one code point.
        single = int(codepoints, base=16)
        return single, single

    first, last = (int(part, base=16) for part in parts)
    return first, last


def parse(file_name):
    """Read a 'codepoints ; value # comment' data file into a RangeList.

    Each data line holds a code point (or 'START..END' range) and a property
    value separated by ';'. Everything after '#' is ignored, as are lines
    that contain nothing but whitespace or a comment.

    Returns the range_list.RangeList built by adding the entries in file
    order; RangeList.add raises if the entries are not in ascending order.
    """
    ranges = range_list.RangeList()

    # Unicode data files are UTF-8; don't depend on the locale's default.
    with open(file_name, encoding='utf-8') as data_file:
        for line in data_file:
            # str.split always returns at least one element, so index 0 is
            # safe: it is the part of the line before any comment.
            data = line.split('#', 1)[0].strip()
            if not data:
                continue  # Blank line, or only a comment on this line

            codepoints, value = data.split(';')
            range_start, range_end = parse_codepoints(codepoints.strip())

            ranges.add(
                range_list.RangeProperty(range_start, range_end, value.strip()))

    return ranges
65 changes: 65 additions & 0 deletions character_width/range_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from dataclasses import dataclass, replace
from typing import Any


@dataclass(frozen=True)
class RangeProperty:
    """An immutable inclusive range [range_start, range_end] with a property.

    Raises an Exception on construction when range_start > range_end.
    """
    # First value covered by the range (inclusive).
    range_start: int
    # Last value covered by the range (inclusive).
    range_end: int
    # Value attached to the whole range; only compared for equality here.
    property: Any

    def __post_init__(self):
        if self.range_start > self.range_end:
            raise Exception("Invalid range.")

    def can_be_merged(self, other):
        """Return True when both ranges carry the same property and either
        overlap or are directly adjacent (no gap between them)."""
        if self.property != other.property:
            return False

        # A gap exists when one range ends more than one step before the
        # other begins; the ranges are mergeable when there is no gap.
        return not (
            self.range_end + 1 < other.range_start or
            other.range_end + 1 < self.range_start)

    def merge(self, other):
        """Creates a range that encompasses both self and other.

        Doesn't check whether the ranges can be merged; use can_be_merged
        for that. The result keeps self's property.
        """
        return RangeProperty(
            range_start=min(self.range_start, other.range_start),
            range_end=max(self.range_end, other.range_end),
            property=self.property)


class RangeList:
    """Ordered collection of ranges that merges compatible neighbours.

    Ranges must be added in ascending order. The most recently added range is
    kept "in progress" so that a following range can still be merged into it;
    it is moved to the completed list once a non-mergeable range arrives.
    """

    def __init__(self):
        self._completed_ranges = []
        self._in_progress_range = None

    def add(self, new_range):
        """Append new_range, merging it into the previous one when possible.

        Raises an Exception when new_range does not start after the end of
        the range added before it.
        """
        current = self._in_progress_range
        if current is None:
            self._in_progress_range = new_range
            return

        if new_range.range_start <= current.range_end:
            raise Exception('Items must be added in order.')

        if current.can_be_merged(new_range):
            self._in_progress_range = current.merge(new_range)
            return

        self._completed_ranges.append(current)
        self._in_progress_range = new_range

    def __iter__(self):
        """Yield every range in order; an empty list yields nothing."""
        yield from self._completed_ranges
        # Nothing has been added yet when the in-progress slot is still None;
        # yielding it would hand callers a None instead of a range.
        if self._in_progress_range is not None:
            yield self._in_progress_range


def transform_list(range_list, property_mapping):
    """Build a new RangeList with every range's property mapped.

    Each range from range_list is copied with its property replaced by
    property_mapping(property) and added to a fresh RangeList, so neighbours
    that become mergeable after the mapping are merged by RangeList.add.
    """
    result = RangeList()
    for entry in range_list:
        mapped_property = property_mapping(entry.property)
        result.add(replace(entry, property=mapped_property))

    return result
38 changes: 38 additions & 0 deletions character_width/src/character_width.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#include <iterator>
#include <algorithm>

namespace cw {

namespace {

// One entry of the double-width table. Both bounds are inclusive: the lookup
// in character_width treats a code point equal to `end` as inside the range.
struct code_point_range {
char32_t begin;
char32_t end;
};

// Table of double-width ranges. The contents come from the build-generated
// double_width_table.inc, which holds one `{ begin, end },` initializer per
// range in ascending order; the binary search in character_width relies on
// that ordering.
code_point_range const double_width_ranges[] {

#include <double_width_table.inc>

};

} // anonymous namespace

int character_width(char32_t code)
{
auto const it = std::lower_bound(
std::begin(double_width_ranges),
std::end(double_width_ranges),
code,
[](auto const range, auto const code) { return range.end < code; });

if (it == std::end(double_width_ranges))
return 1;

if (code >= it->begin)
return 2;

return 1;
}

} // cw::

0 comments on commit 004aa9a

Please sign in to comment.