Skip to content

Commit

Permalink
Arabic Presentation Form Normalizer
Browse files Browse the repository at this point in the history
  • Loading branch information
adehad committed Nov 10, 2024
1 parent 7f58483 commit 53ae81c
Show file tree
Hide file tree
Showing 11 changed files with 471 additions and 6 deletions.
7 changes: 7 additions & 0 deletions .pre-commit-hooks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,10 @@
language: python
types: [text]
stages: [commit, push, manual]
- id: arabic-presentation-form
name: Arabic Presentation Form Normalizer
description: Replaces Arabic Presentation Forms and other contextual forms with their default characters.
entry: arabic-presentation-form
language: python
types: [text]
stages: [commit, push, manual]
7 changes: 4 additions & 3 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,22 @@
"configurations": [
{
"name": "pyModule",
"type": "python",
"type": "debugpy",
"request": "launch",
"module": "pre_commit_hooks.check_header_footer",
"console": "integratedTerminal",
"justMyCode": true
},
{
"name": "pytest",
"type": "python",
"type": "debugpy",
"request": "launch",
"module": "pytest",
"console": "integratedTerminal",
"args": [
"--no-cov", // disable as it affects breakpoints
"-vv", "-k",
"-vv",
"-k",
"" // add test function name here
],
"justMyCode": true
Expand Down
4 changes: 2 additions & 2 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
"editor.wordBasedSuggestions": "off",
"editor.defaultFormatter": "charliermarsh.ruff",
"editor.codeActionsOnSave": {
"source.fixAll": "explicit",
"source.organizeImports": "explicit"
"source.fixAll": "always",
"source.organizeImports": "always"
}
}
}
29 changes: 29 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,35 @@ repos:
Future work:
1. Support a year parameter that can be used to apply fixes.
### arabic-presentation-form
Replaces characters from the Arabic Presentation Forms blocks (A or B) with their 'default' Unicode equivalents.
This is useful, for example, with the 'Scheherazade New' font, which does not support these characters.
Arguments:
- `--excluded-chars`: Regex of characters to exclude from being fixed.
- `--custom-rules`: Rules to update or override the tool's built-in configuration. Format and example below:
```json
"RuleName": {"rule": {"ReplacementCharacter(s)": "RegexOfApplicableCharacter(s)"}}
"ʾalif": {"rule": {"\u0627": "(\ufe8d|\ufe8e)"}},
```

Example that extends the applicable file types and restricts the hook to a specific folder (everything under `site/data`):

```yaml
repos:
- repo: https://github.com/adehad/pre-commit-hooks
rev: main
hooks:
- id: arabic-presentation-form
entry: arabic-presentation-form
language: python
types_or: [text, json, markdown]
args: [--excluded-chars, (ﷺ)]
files: ^site/data/
```

## Local Installation

```console
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Source = "https://github.com/adehad/pre-commit-hooks/"

[project.scripts]
check-header-footer = "pre_commit_hooks.check_header_footer:main"
arabic-presentation-form = "pre_commit_hooks.arabic_presentation_form:main"

[tool.hatch.build]
sources = ["src"]
Expand Down Expand Up @@ -72,7 +73,8 @@ python = ["38", "39", "310", "311"]
# External Tool Config
########################################################################################
[tool.mypy]
python_version = 3.8
python_version = '3.8'
strict = true
ignore_missing_imports = true
namespace_packages = true
show_error_codes = true
Expand Down
182 changes: 182 additions & 0 deletions src/pre_commit_hooks/arabic_presentation_form/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
"""Arabic Presentation Form."""

from __future__ import annotations

import functools
import pathlib
import re
import sys
from typing import Any, Dict, Sequence

from ..util import (
ABCArgs,
ABCHook,
ExitCode,
HashableDict,
load_json_source,
)
from . import char_map

# For Windows: we want to be sure to use UTF-8 when printing the report lines.
# A replaced stdout (any stream that is not a TextIOWrapper) may not provide
# ``reconfigure``, so only call it when it is available instead of failing at
# import time.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")

# Maps a compiled regex to the replacement text for whatever that regex matches.
# ``re.Pattern`` only became subscriptable at runtime in Python 3.9; the
# parameterised form is written as a forward reference so that importing this
# module does not raise ``TypeError`` on Python 3.8.
RulesDict = Dict["re.Pattern[Any]", str]


def apply_rules_to_lines(
    line: str,
    rules: RulesDict,
    exclude: re.Pattern[str],
    file_name: str,
    line_no: int,
) -> tuple[ExitCode, str]:
    """Apply the replacement rules to every character of a single line.

    A ``[Fixed]`` or ``[Not Fixed]`` report line is printed for each character
    that was replaced, or that is still flagged by
    ``char_map.is_contains_non_general_form`` after the rules were tried.

    Args:
        line (str): Line to check the rules.
        rules (RulesDict): The rules to check against.
        exclude (re.Pattern[str]): Text matched by this pattern is left
            untouched and is not reported.
        file_name (str): The name of the file being checked (report only).
        line_no (int): The line number being checked (report only).

    Returns:
        (ExitCode, str): (The PASS/FAIL state, The new line).
    """
    # Mask excluded text so that it cannot trigger the quick pre-check below.
    masked_line = exclude.sub(" ", line)

    # Quick pre-check on the highest code point only; ``default`` keeps an
    # empty line from raising ValueError in ``max``.
    if not char_map.is_contains_non_general_form(max(masked_line, default=" ")):
        return ExitCode.OK, line

    # 0-based columns covered by an exclude match. Zero-width matches (e.g. the
    # default empty pattern) contribute no columns.
    excluded_cols = {
        col
        for match in exclude.finditer(line)
        for col in range(match.start(), match.end())
    }

    # Wrap once: ``apply_rule`` is cached and therefore needs a hashable mapping.
    hashable_rules = HashableDict(rules)
    new_chars: list[str] = []

    for col_no, c in enumerate(line, start=1):
        if (col_no - 1) in excluded_cols:
            # Excluded characters are copied through verbatim.
            new_chars.append(c)
            continue

        new_c = apply_rule(rules=hashable_rules, character=c)
        if c != new_c:
            status = "Fixed"
        elif char_map.is_contains_non_general_form(new_c):
            status = "Not Fixed"
        else:
            status = ""

        if status:
            new_c_as_unicode_hex = [f"\\u{ord(ch):04x}" for ch in new_c]
            location = f"{file_name}:{line_no}:{col_no}"
            print(f"[{status}] {location} [{new_c} ({new_c_as_unicode_hex})]")

        new_chars.append(new_c)

    # Reaching the loop at all means the pre-check flagged the line.
    return ExitCode.FAIL, "".join(new_chars)


def get_rules(custom_rules: char_map.CHAR_MAP_TYPE) -> RulesDict:
    """Compile the built-in character map, overlaid with any custom rules.

    Args:
        custom_rules (char_map.CHAR_MAP_TYPE): Additional rules keyed by rule
            name. An entry whose name already exists in ``char_map.CHAR_MAP``
            replaces the built-in entry. Each value must hold a ``"rule"``
            mapping of replacement text to the regex it applies to.

    Returns:
        RulesDict: Compiled regex -> replacement text.
    """
    # Copy into a fresh dict so the module-level CHAR_MAP is never mutated.
    complete_rules: char_map.CHAR_MAP_TYPE = {}
    complete_rules.update(char_map.CHAR_MAP)
    complete_rules.update(custom_rules)

    regex_rules: RulesDict = {}
    for char_mapping_rule in complete_rules.values():
        for expected_out, expected_regex in char_mapping_rule["rule"].items():
            regex_rules[re.compile(expected_regex)] = expected_out
    return regex_rules


@functools.lru_cache
def apply_rule(rules: RulesDict, character: str) -> str:
    """Return ``character`` after applying the first rule that matches it.

    Results are memoised, so ``rules`` has to be hashable.

    Args:
        rules (RulesDict): Compiled regex -> replacement text, tried in
            iteration order.
        character (str): The letter/character to check against.

    Returns:
        str: The substituted text, or ``character`` unchanged when no rule
        matches.
    """
    for pattern, replacement in rules.items():
        if pattern.match(character) is None:
            continue
        # First match wins; remaining rules are not consulted.
        return pattern.sub(replacement, character)
    return character


class ArabicPresentationFormArgs(ABCArgs):
    """Arguments read by ``ArabicPresentationFormChecker.implementation``."""

    # Regex source; compiled with ``re.compile`` and used to exclude characters.
    excluded_chars: str
    # Extra rules handed to ``get_rules``, which layers them over the built-in map.
    custom_rules: char_map.CHAR_MAP_TYPE


class ArabicPresentationFormChecker(ABCHook):
    """Hook that replaces Arabic presentation-form characters in files."""

    def setup_parser(self) -> None:
        """Register the hook-specific command-line arguments."""
        self.parser.add_argument(
            "--excluded-chars",
            type=str,
            default="",
            metavar="exclude-char-regex",
            help="Regex for characters to exclude. e.g. (ﷺ)",
        )
        self.parser.add_argument(
            "--custom-rules",
            type=load_json_source,
            default={},
            metavar="Path-OR-JSON-String",
            help=(
                '"RuleName": {"rule": {"ReplacementCharacter(s)": "RegexOfApplicableCharacter(s)"}}'  # noqa: E501
                '. e.g. "ʾalif": {"rule": {"\u0627": "(\ufe8d|\ufe8e)"}},'  # noqa: RUF001
                + ". To exclude a unicode character, overwrite its default entry."
            ),
        )

    def implementation(
        self,
        file_name: pathlib.Path,
        args: ArabicPresentationFormArgs,
    ) -> ExitCode:
        """Apply the rules to one file and rewrite it if anything changed.

        Args:
            file_name (pathlib.Path): File to check; read and written as UTF-8.
            args (ArabicPresentationFormArgs): Parsed hook arguments.

        Returns:
            ExitCode: The per-line exit codes OR-ed together.
        """
        exit_code = int(ExitCode.OK)
        exclude_regex = re.compile(args.excluded_chars)
        # The rules are the same for every line, so compile them only once.
        rules = get_rules(args.custom_rules)

        # newline="" disables newline translation, so the original line endings
        # (e.g. CRLF) are read as-is and survive the rewrite.
        with file_name.open("r", encoding="utf-8", newline="") as f:
            original_lines = f.readlines()

        new_file_lines = []
        for line_no, line in enumerate(original_lines, start=1):
            intermediate_exit_code, new_line = apply_rules_to_lines(
                line=line,
                line_no=line_no,
                file_name=str(file_name),
                rules=rules,
                exclude=exclude_regex,
            )
            exit_code |= intermediate_exit_code

            # Excluded characters are ignored when deciding whether something
            # is still left unfixed; " " stands in for an empty remainder.
            if char_map.is_contains_non_general_form(
                max(exclude_regex.sub("", new_line) or " ")
            ):
                print(f"Incomplete Fixes Applied: {file_name}:{line_no}")

            new_file_lines.append(new_line)

        # Leave the file untouched when there is nothing to change.
        if new_file_lines != original_lines:
            with file_name.open("w", encoding="utf-8", newline="") as f:
                f.writelines(new_file_lines)
        return ExitCode(exit_code)


def main(argv: Sequence[str] | None = None) -> int:
    """Run the Arabic presentation form hook.

    Args:
        argv (Sequence[str] | None): Forwarded unchanged to the checker's
            ``run`` method.

    Returns:
        int: Whatever the checker's ``run`` method returns.
    """
    return ArabicPresentationFormChecker().run(argv=argv)
8 changes: 8 additions & 0 deletions src/pre_commit_hooks/arabic_presentation_form/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
"""Arabic Presentation Form Hook."""

from __future__ import annotations

from . import main

# Only when executed as a script/module (not on import): run the hook and use
# the value returned by ``main`` as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
Loading

0 comments on commit 53ae81c

Please sign in to comment.