-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add project for calculating character width
Uses 'EastAsianWidth.txt' downloaded from Unicode to generate a lookup table.
- Loading branch information
Showing
7 changed files
with
216 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
build/ | ||
build-*/ | ||
__pycache__/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# Fetch the Unicode East Asian Width data once per build tree.
set(CW_EAW_DATA_FILE ${CMAKE_CURRENT_BINARY_DIR}/EastAsianWidth.txt)

if(NOT EXISTS ${CW_EAW_DATA_FILE})
  file(DOWNLOAD
    https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt
    ${CW_EAW_DATA_FILE}
    STATUS CW_EAW_DOWNLOAD_STATUS)

  # The download result is reported through STATUS (a "code;message" list).
  # A failed transfer can leave an empty or partial file behind, which the
  # EXISTS check above would then accept on every later configure run, so
  # remove it and stop with a clear message instead.
  list(GET CW_EAW_DOWNLOAD_STATUS 0 CW_EAW_DOWNLOAD_CODE)
  if(NOT CW_EAW_DOWNLOAD_CODE EQUAL 0)
    list(GET CW_EAW_DOWNLOAD_STATUS 1 CW_EAW_DOWNLOAD_MESSAGE)
    file(REMOVE ${CW_EAW_DATA_FILE})
    message(FATAL_ERROR
      "Failed to download EastAsianWidth.txt: ${CW_EAW_DOWNLOAD_MESSAGE}")
  endif()
endif()

# Generate the C++ initializer list of double-width code point ranges.
add_custom_command(
  OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/double_width_table.inc
  COMMAND
    python3 ${CMAKE_CURRENT_SOURCE_DIR}/generate_width_table.py
    --input ${CW_EAW_DATA_FILE}
    --output ${CMAKE_CURRENT_BINARY_DIR}/double_width_table.inc

  # Regenerate when the data file or any of the generator scripts change.
  DEPENDS
    ${CW_EAW_DATA_FILE}
    generate_width_table.py
    parse_eaw.py
    range_list.py
  COMMENT "Generating double_width_table.inc"
  VERBATIM
)

# Listing the generated file as a source makes the library depend on the
# custom command above.
add_library(character_width
  ${CMAKE_CURRENT_BINARY_DIR}/double_width_table.inc
  src/character_width.cpp)

# The binary dir is private: only character_width.cpp includes the table.
target_include_directories(character_width
  PRIVATE ${CMAKE_CURRENT_BINARY_DIR}
  PUBLIC include)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import sys | ||
import argparse | ||
|
||
import parse_eaw | ||
import range_list | ||
|
||
|
||
def eaw_to_width(eaw):
    """Map an East Asian Width property value to a column width.

    'W' and 'F' map to 2; 'Na', 'H', 'N' and 'A' map to 1.

    Raises:
        Exception: if ``eaw`` is none of the values listed above.
    """
    if eaw in ('W', 'F'):
        return 2
    if eaw in ('Na', 'H', 'N', 'A'):
        return 1
    raise Exception(f'Unhandled Value {eaw}')
|
||
|
||
# Script body: read the East Asian Width data file given by --input and write
# the double-width code point ranges to --output as C++ aggregate
# initializers ("{ start, end },"), three per line.
parser = argparse.ArgumentParser(description='Generate double width table')
parser.add_argument('--input', required=True)
parser.add_argument('--output', required=True)

args = parser.parse_args()

east_asian_width = parse_eaw.parse(args.input)
# Collapse the property values to widths; adjacent ranges that end up with
# the same width are merged by the range list.
width_table = range_list.transform_list(east_asian_width, eaw_to_width)

with open(args.output, 'w') as output:
    # The header belongs in the generated file, not on stdout.
    print(f'// This file was generated by {__file__}', file=output)

    double_width = (w for w in width_table if w.property == 2)
    for count, w in enumerate(double_width, start=1):
        # Break the line after every third entry, otherwise separate entries
        # with a single space.
        separator = '\n' if count % 3 == 0 else ' '
        print(f'{{ {w.range_start:#7x}, {w.range_end:#7x} }},',
              end=separator, file=output)

    # Terminate the last (possibly partial) line.
    print(end='\n', file=output)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
#ifndef CW_CHARACTER_WIDTH_HPP
#define CW_CHARACTER_WIDTH_HPP

namespace cw {

/// Returns the width of the given code point: 2 if it is listed in the
/// generated double-width table, 1 otherwise.
int character_width(char32_t code);

} // cw::

#endif // header guard
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
import range_list | ||
|
||
|
||
def parse_codepoints(codepoints):
    """Parse a hexadecimal code point or ``START..END`` range.

    Returns:
        A ``(start, end)`` tuple of ints. For a single code point both
        elements are the same value.

    Raises:
        ValueError: if the text is not valid hexadecimal.
    """
    bounds = codepoints.split('..')
    if len(bounds) != 2:
        # Not a range: treat the whole text as one code point.
        return int(codepoints, base=16), int(codepoints, base=16)

    first, last = bounds
    return int(first, base=16), int(last, base=16)
|
||
|
||
def parse(file_name):
    """Parse an East Asian Width data file into a range list.

    Each data line has the form ``CODEPOINTS;PROPERTY`` where CODEPOINTS is a
    single hexadecimal code point or a ``START..END`` range. Text after ``#``
    is a comment; blank and comment-only lines are skipped.

    Args:
        file_name: Path of the data file to read.

    Returns:
        A ``range_list.RangeList`` holding one ``RangeProperty`` per data
        line, with mergeable neighbours merged by ``RangeList.add``.

    Raises:
        ValueError: if a data line does not contain exactly one ``;`` or the
            code points are not valid hexadecimal.
    """
    ranges = range_list.RangeList()

    # Unicode data files are UTF-8; don't depend on the locale's default.
    with open(file_name, encoding='utf-8') as data_file:
        for line in data_file:
            # Keep only the part before the comment marker. split() always
            # returns at least one element, so indexing is safe.
            data = line.split('#', 1)[0].strip()
            if not data:
                continue  # Blank line or comment-only line

            codepoints, eaw_property = data.split(';')
            range_start, range_end = parse_codepoints(codepoints.strip())

            ranges.add(range_list.RangeProperty(
                range_start, range_end, eaw_property.strip()))

    return ranges
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
from dataclasses import dataclass, replace | ||
|
||
|
||
@dataclass(frozen=True)
class RangeProperty:
    """An inclusive integer range tagged with a property value.

    Raises:
        ValueError: on construction if ``range_start`` is greater than
            ``range_end``.
    """

    range_start: int
    range_end: int
    # Any value comparable with ``!=``. Annotated as ``object`` because the
    # builtin ``any`` is a function, not a type.
    property: object

    def __post_init__(self):
        if self.range_start > self.range_end:
            raise ValueError("Invalid range.")

    def can_be_merged(self, other):
        """Return True if ``other`` has the same property and the two ranges
        overlap or are directly adjacent (no gap between them)."""
        if self.property != other.property:
            return False

        # The "+ 1" lets ranges that touch (end + 1 == start) count as
        # mergeable, in either order.
        return (
            self.range_end + 1 >= other.range_start and
            other.range_end + 1 >= self.range_start)

    def merge(self, other):
        """Creates a range that encompasses both self and other.

        Doesn't check whether the ranges can be merged.
        You can check that via can_be_merged.
        The result carries the property of self.
        """
        return RangeProperty(
            range_start=min(self.range_start, other.range_start),
            range_end=max(self.range_end, other.range_end),
            property=self.property)
|
||
|
||
class RangeList:
    """Ordered collection of ranges that merges compatible neighbours.

    Ranges must be added in ascending order. The most recently added range is
    kept "in progress" so that a following range can still be merged into it.
    """

    def __init__(self):
        self._completed_ranges = []
        self._in_progress_range = None

    def add(self, new_range):
        """Append ``new_range``, merging it into the previous range if the
        previous range's ``can_be_merged`` allows it.

        Raises:
            Exception: if ``new_range`` starts at or before the end of the
                previously added range.
        """
        if self._in_progress_range is None:
            self._in_progress_range = new_range
            return

        if new_range.range_start <= self._in_progress_range.range_end:
            raise Exception('Items must be added in order.')

        if self._in_progress_range.can_be_merged(new_range):
            self._in_progress_range = self._in_progress_range.merge(new_range)
            return

        # Nothing more can be merged into the previous range; finalize it.
        self._completed_ranges.append(self._in_progress_range)
        self._in_progress_range = new_range

    def __iter__(self):
        """Yield all ranges in the order they were added."""
        yield from self._completed_ranges
        # An empty list has no in-progress range; don't yield None.
        if self._in_progress_range is not None:
            yield self._in_progress_range
|
||
|
||
def transform_list(range_list, property_mapping):
    """Build a new RangeList with every range's property mapped.

    Each range from ``range_list`` is copied with its property replaced by
    ``property_mapping(property)`` and added to a fresh RangeList.
    """
    mapped = RangeList()
    for entry in range_list:
        mapped.add(replace(entry, property=property_mapping(entry.property)))

    return mapped
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
#include <iterator> | ||
#include <algorithm> | ||
|
||
namespace cw {

namespace {

// Inclusive code point range [begin, end].
struct code_point_range {
    char32_t begin;
    char32_t end;
};

// Ranges of double-width code points, filled in from the generated table.
// The binary search below relies on the entries being in ascending order.
code_point_range const double_width_ranges[] {

#include <double_width_table.inc>

};

} // anonymous namespace

int character_width(char32_t code)
{
    auto const table_begin = std::begin(double_width_ranges);
    auto const table_end = std::end(double_width_ranges);

    // True for ranges that lie entirely below the searched value.
    auto const ends_before = [](auto const range, auto const value) {
        return range.end < value;
    };

    // First range whose end is not below `code`, if there is one.
    auto const candidate =
        std::lower_bound(table_begin, table_end, code, ends_before);

    // `code` is double width only if that range also starts at or before it.
    bool const is_double_width =
        candidate != table_end && code >= candidate->begin;

    return is_double_width ? 2 : 1;
}

} // cw::