xy_cut_enhanced #3806

Merged · 1 commit · Apr 24, 2025

138 changes: 72 additions & 66 deletions paddlex/inference/models/formula_recognition/processors.py
@@ -631,74 +631,80 @@ def __init__(
         self.pad_token_type_id = 0
         self.pad_to_multiple_of = None

-        temp_path = tempfile.gettempdir()
-        fast_tokenizer_file = os.path.join(temp_path, "tokenizer.json")
-        tokenizer_config_file = os.path.join(temp_path, "tokenizer_config.json")
-        try:
-            with open(fast_tokenizer_file, "w") as f:
-                json.dump(character_list["fast_tokenizer_file"], f)
-            with open(tokenizer_config_file, "w") as f:
-                json.dump(character_list["tokenizer_config_file"], f)
-        except Exception as e:
-            print(
-                f"Failed to create tokenizer.json and tokenizer_config.json, reason: {str(e)}"
-            )
-
-        self.tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
-        added_tokens_decoder = {}
-        added_tokens_map = {}
-        if tokenizer_config_file is not None:
-            with open(
-                tokenizer_config_file, encoding="utf-8"
-            ) as tokenizer_config_handle:
-                init_kwargs = json.load(tokenizer_config_handle)
-                if "added_tokens_decoder" in init_kwargs:
-                    for idx, token in init_kwargs["added_tokens_decoder"].items():
-                        if isinstance(token, dict):
-                            token = AddedToken(**token)
-                        if isinstance(token, AddedToken):
-                            added_tokens_decoder[int(idx)] = token
-                            added_tokens_map[str(token)] = token
-                        else:
-                            raise ValueError(
-                                f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary or an AddedToken instance"
-                            )
-                    init_kwargs["added_tokens_decoder"] = added_tokens_decoder
-                added_tokens_decoder = init_kwargs.pop("added_tokens_decoder", {})
-                tokens_to_add = [
-                    token
-                    for index, token in sorted(
-                        added_tokens_decoder.items(), key=lambda x: x[0]
-                    )
-                    if token not in added_tokens_decoder
-                ]
-                added_tokens_encoder = self.added_tokens_encoder(added_tokens_decoder)
-                encoder = list(added_tokens_encoder.keys()) + [
-                    str(token) for token in tokens_to_add
-                ]
-                tokens_to_add += [
-                    token
-                    for token in self.all_special_tokens_extended
-                    if token not in encoder and token not in tokens_to_add
-                ]
-                if len(tokens_to_add) > 0:
-                    is_last_special = None
-                    tokens = []
-                    special_tokens = self.all_special_tokens
-                    for token in tokens_to_add:
-                        is_special = (
-                            (token.special or str(token) in special_tokens)
-                            if isinstance(token, AddedToken)
-                            else str(token) in special_tokens
-                        )
-                        if is_last_special is None or is_last_special == is_special:
-                            tokens.append(token)
-                        else:
-                            self._add_tokens(tokens, special_tokens=is_last_special)
-                            tokens = [token]
-                        is_last_special = is_special
-                    if tokens:
-                        self._add_tokens(tokens, special_tokens=is_last_special)
+        with tempfile.NamedTemporaryFile(
+            mode="w", suffix=".json", delete=True
+        ) as temp_file1, tempfile.NamedTemporaryFile(
+            mode="w", suffix=".json", delete=True
+        ) as temp_file2:
+            fast_tokenizer_file = temp_file1.name
+            tokenizer_config_file = temp_file2.name
+            try:
+                with open(fast_tokenizer_file, "w") as f:
+                    json.dump(character_list["fast_tokenizer_file"], f)
+                with open(tokenizer_config_file, "w") as f:
+                    json.dump(character_list["tokenizer_config_file"], f)
+            except Exception as e:
+                print(
+                    f"Failed to create tokenizer.json and tokenizer_config.json, reason: {str(e)}"
+                )
+
+            self.tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
+            added_tokens_decoder = {}
+            added_tokens_map = {}
+            if tokenizer_config_file is not None:
+                with open(
+                    tokenizer_config_file, encoding="utf-8"
+                ) as tokenizer_config_handle:
+                    init_kwargs = json.load(tokenizer_config_handle)
+                    if "added_tokens_decoder" in init_kwargs:
+                        for idx, token in init_kwargs["added_tokens_decoder"].items():
+                            if isinstance(token, dict):
+                                token = AddedToken(**token)
+                            if isinstance(token, AddedToken):
+                                added_tokens_decoder[int(idx)] = token
+                                added_tokens_map[str(token)] = token
+                            else:
+                                raise ValueError(
+                                    f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary or an AddedToken instance"
+                                )
+                        init_kwargs["added_tokens_decoder"] = added_tokens_decoder
+                    added_tokens_decoder = init_kwargs.pop("added_tokens_decoder", {})
+                    tokens_to_add = [
+                        token
+                        for index, token in sorted(
+                            added_tokens_decoder.items(), key=lambda x: x[0]
+                        )
+                        if token not in added_tokens_decoder
+                    ]
+                    added_tokens_encoder = self.added_tokens_encoder(
+                        added_tokens_decoder
+                    )
+                    encoder = list(added_tokens_encoder.keys()) + [
+                        str(token) for token in tokens_to_add
+                    ]
+                    tokens_to_add += [
+                        token
+                        for token in self.all_special_tokens_extended
+                        if token not in encoder and token not in tokens_to_add
+                    ]
+                    if len(tokens_to_add) > 0:
+                        is_last_special = None
+                        tokens = []
+                        special_tokens = self.all_special_tokens
+                        for token in tokens_to_add:
+                            is_special = (
+                                (token.special or str(token) in special_tokens)
+                                if isinstance(token, AddedToken)
+                                else str(token) in special_tokens
+                            )
+                            if is_last_special is None or is_last_special == is_special:
+                                tokens.append(token)
+                            else:
+                                self._add_tokens(tokens, special_tokens=is_last_special)
+                                tokens = [token]
+                            is_last_special = is_special
+                        if tokens:
+                            self._add_tokens(tokens, special_tokens=is_last_special)

     def _add_tokens(
         self, new_tokens: "List[Union[AddedToken, str]]", special_tokens: bool = False
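For orientation, the change above swaps fixed file names under `tempfile.gettempdir()` (a path shared by every process, so concurrent pipelines could overwrite each other's `tokenizer.json`) for `NamedTemporaryFile` handles that get unique paths and are deleted when the `with` block exits. Below is a minimal standalone sketch of that pattern using only the standard library; the function name and payloads are illustrative and not part of PaddleX, and it assumes a POSIX-like OS, since a `NamedTemporaryFile` created with `delete=True` cannot be reopened by name while still open on Windows.

```python
import json
import tempfile


def build_tokenizer_files(tokenizer_payload: dict, config_payload: dict) -> dict:
    """Write two JSON payloads to self-deleting temp files and read one back.

    Illustrative sketch of the pattern in the diff above, not PaddleX code.
    """
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".json", delete=True
    ) as tok_tmp, tempfile.NamedTemporaryFile(
        mode="w", suffix=".json", delete=True
    ) as cfg_tmp:
        # Each handle gets a unique path, so concurrent processes no longer
        # race on a shared tokenizer.json under tempfile.gettempdir().
        with open(tok_tmp.name, "w") as f:
            json.dump(tokenizer_payload, f)
        with open(cfg_tmp.name, "w") as f:
            json.dump(config_payload, f)

        # Anything that consumes the files by path must run inside the
        # with block; both files are removed automatically on exit.
        with open(cfg_tmp.name, encoding="utf-8") as f:
            return json.load(f)


if __name__ == "__main__":
    cfg = build_tokenizer_files(
        {"model": {"type": "BPE"}}, {"added_tokens_decoder": {}}
    )
    print(cfg)  # {'added_tokens_decoder': {}}
```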
4 changes: 2 additions & 2 deletions paddlex/inference/pipelines/layout_parsing/pipeline.py
@@ -240,10 +240,10 @@ def get_layout_parsing_res(
                     )
                     seal_index += 1
                 else:
-                    ocr_res_in_box, matched_idxs = get_sub_regions_ocr_res(
+                    ocr_res_in_box, matched_idxes = get_sub_regions_ocr_res(
                         overall_ocr_res, [box], return_match_idx=True
                     )
-                    for matched_idx in matched_idxs:
+                    for matched_idx in matched_idxes:
                         if matched_ocr_dict.get(matched_idx, None) is None:
                             matched_ocr_dict[matched_idx] = [object_box_idx]
                         else:
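The hunk above only renames `matched_idxs` to `matched_idxes`, but the bookkeeping around it is easy to miss in a two-line diff: for each layout box, the indices of the OCR lines that fall inside it are collected, and `matched_ocr_dict` maps each OCR index to every layout box that claimed it. Here is a small self-contained sketch of that accumulation step; the helper name `accumulate_matches` and the toy data are invented, and the final `append` is an assumption about the `else` branch truncated in the excerpt.

```python
from typing import Dict, List


def accumulate_matches(matches_per_box: List[List[int]]) -> Dict[int, List[int]]:
    """Map each matched OCR index to every layout box index that claimed it.

    Mirrors the matched_ocr_dict bookkeeping shown in the hunk; the input
    format is invented for illustration.
    """
    matched_ocr_dict: Dict[int, List[int]] = {}
    for object_box_idx, matched_idxes in enumerate(matches_per_box):
        for matched_idx in matched_idxes:
            if matched_ocr_dict.get(matched_idx, None) is None:
                matched_ocr_dict[matched_idx] = [object_box_idx]
            else:
                # Assumed continuation of the else branch cut off in the diff.
                matched_ocr_dict[matched_idx].append(object_box_idx)
    return matched_ocr_dict


if __name__ == "__main__":
    # Box 0 matched OCR lines 2 and 5; box 1 matched lines 5 and 7.
    print(accumulate_matches([[2, 5], [5, 7]]))  # {2: [0], 5: [0, 1], 7: [1]}
```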