Commit d5cdb62

Merge pull request #3639 from flairNLP/GH-3515-NER_DANSK
Add Danish NER corpus
2 parents 39ec21e + 46e8c95

File tree

2 files changed: +107 -0 lines changed


flair/datasets/__init__.py

+1

@@ -188,6 +188,7 @@
     NER_BAVARIAN_WIKI,
     NER_CHINESE_WEIBO,
     NER_DANISH_DANE,
+    NER_DANISH_DANSK,
     NER_ENGLISH_MOVIE_COMPLEX,
     NER_ENGLISH_MOVIE_SIMPLE,
     NER_ENGLISH_PERSON,
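
This one-line addition is what exposes the new corpus through the public datasets namespace. A minimal sketch of what that enables, assuming a Flair build that includes this merge:

    from flair.datasets import NER_DANISH_DANSK  # re-exported via flair/datasets/__init__.py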

flair/datasets/sequence_labeling.py

+106
@@ -5618,3 +5618,109 @@ def __init__(
             label_name_map=label_name_map,
             **corpusargs,
         )
+
+
+class NER_DANISH_DANSK(ColumnCorpus):
+    """Danish NER dataset from the chcaa/dansk-ner HuggingFace dataset."""
+
+    def __init__(
+        self,
+        base_path: Optional[Union[str, Path]] = None,
+        in_memory: bool = True,
+        **corpusargs,
+    ) -> None:
+        """Initialize the dansk-ner corpus.
+
+        Args:
+            base_path: Path to the corpus on your machine
+            in_memory: If True, keeps dataset in memory giving speedups in training
+            corpusargs: Additional arguments for corpus initialization
+        """
+        if base_path is None:
+            base_path = Path(flair.cache_root) / "datasets" / "ner_danish_dansk"
+        else:
+            base_path = Path(base_path)
+
+        # Create the corpus directory if it doesn't exist
+        base_path.mkdir(parents=True, exist_ok=True)
+
+        # Download dataset from HuggingFace and convert to CoNLL format
+        for split in ["train", "test", "validation"]:
+            conll_path = base_path / f"{split}.tsv"
+
+            # Only download and convert if file doesn't exist
+            if not conll_path.exists():
+                try:
+                    from datasets import load_dataset
+
+                    # Load the specific split from HuggingFace
+                    ds = load_dataset("chcaa/dansk-ner")[split if split != "validation" else "dev"]
+
+                    # Serialize the dataset to JSON for debugging
+                    debug_json_path = base_path / f"{split}_debug.json"
+                    import json
+
+                    # Convert dataset to a list of dictionaries and save with nice formatting
+                    dataset_for_json = [
+                        {"text": item["text"], "tokens": item["tokens"], "ents": item["ents"]} for item in ds
+                    ]
+
+                    with open(debug_json_path, "w", encoding="utf-8") as f_debug:
+                        json.dump(dataset_for_json, f_debug, ensure_ascii=False, indent=2)
+
+                    # Convert to CoNLL format
+                    with open(conll_path, "w", encoding="utf-8") as f_out:
+                        for example in ds:
+                            text = example["text"]  # Don't strip the text
+                            tokens = example["tokens"]
+                            ents = example["ents"]
+
+                            # Create token-level tags (default to 'O')
+                            tags = ["O"] * len(tokens)
+
+                            # Assign BIO tags based on entity positions
+                            for ent in ents:
+                                start_char = ent["start"]
+                                end_char = ent["end"]
+                                ent_label = ent["label"]
+
+                                # Find tokens that overlap with this entity
+                                for i, token in enumerate(tokens):
+                                    token_start = token["start"]
+                                    token_end = token["end"]
+
+                                    # If token overlaps with entity
+                                    if token_start >= start_char and token_end <= end_char:
+                                        # First token gets B- prefix
+                                        if token_start == start_char:
+                                            tags[i] = f"B-{ent_label}"
+                                        # Subsequent tokens get I- prefix
+                                        else:
+                                            tags[i] = f"I-{ent_label}"
+
+                            # Write tokens and tags
+                            for token, tag in zip(tokens, tags):
+                                token_text = text[token["start"] : token["end"]]  # Don't strip the token
+                                if token_text:  # Still skip empty tokens
+                                    # Replace newlines with space in output to maintain CoNLL format
+                                    token_text = token_text.replace("\n", " ")
+                                    f_out.write(f"{token_text}\t{tag}\n")
+
+                            # Empty line between sentences
+                            f_out.write("\n")
+
+                except Exception as e:
+                    print(f"Error downloading or converting dataset: {e}")
+                    raise
+
+        # Initialize corpus using the converted files
+        super().__init__(
+            base_path,
+            column_format={0: "text", 1: "ner"},
+            train_file="train.tsv",
+            test_file="test.tsv",
+            dev_file="validation.tsv",
+            column_delimiter="\t",
+            in_memory=in_memory,
+            **corpusargs,
+        )
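
With this merge, the corpus can be loaded like any other Flair ColumnCorpus. A minimal usage sketch (not part of the PR): on first use it downloads chcaa/dansk-ner via the datasets library (network access required), writes train.tsv, test.tsv and validation.tsv under Flair's cache root, and then parses them as a two-column token/NER-tag corpus.

    import flair.datasets

    # First call downloads and converts the splits; later calls reuse the cached .tsv files.
    corpus = flair.datasets.NER_DANISH_DANSK()

    print(corpus)  # train/dev/test sentence counts
    print(corpus.make_label_dictionary(label_type="ner"))  # label set recovered from the BIO tags
    print(corpus.train[0])  # first training sentence with its NER spans

Note that each example in the HuggingFace dataset becomes a single Flair Sentence, since the converter writes a blank line only between examples.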
