@@ -5618,3 +5618,109 @@ def __init__(
             label_name_map=label_name_map,
             **corpusargs,
         )
+
+
+class NER_DANISH_DANSK(ColumnCorpus):
+    """Danish NER dataset from the chcaa/dansk-ner HuggingFace dataset."""
+
+    def __init__(
+        self,
+        base_path: Optional[Union[str, Path]] = None,
+        in_memory: bool = True,
+        **corpusargs,
+    ) -> None:
+        """Initialize the dansk-ner corpus.
+
+        Args:
+            base_path: Path to the corpus on your machine
+            in_memory: If True, keeps dataset in memory giving speedups in training
+            corpusargs: Additional arguments for corpus initialization
+        """
+        if base_path is None:
+            base_path = Path(flair.cache_root) / "datasets" / "ner_danish_dansk"
+        else:
+            base_path = Path(base_path)
+
+        # Create the corpus directory if it doesn't exist
+        base_path.mkdir(parents=True, exist_ok=True)
+
+        # Download dataset from HuggingFace and convert to CoNLL format
+        for split in ["train", "test", "validation"]:
+            conll_path = base_path / f"{split}.tsv"
+
+            # Only download and convert if file doesn't exist
+            if not conll_path.exists():
+                try:
+                    from datasets import load_dataset
+
+                    # Load the specific split from HuggingFace
+                    ds = load_dataset("chcaa/dansk-ner")[split if split != "validation" else "dev"]
+
+                    # Serialize the dataset to JSON for debugging
+                    debug_json_path = base_path / f"{split}_debug.json"
+                    import json
+
+                    # Convert dataset to a list of dictionaries and save with nice formatting
+                    dataset_for_json = [
+                        {"text": item["text"], "tokens": item["tokens"], "ents": item["ents"]} for item in ds
+                    ]
+
+                    with open(debug_json_path, "w", encoding="utf-8") as f_debug:
+                        json.dump(dataset_for_json, f_debug, ensure_ascii=False, indent=2)
+
+                    # Convert to CoNLL format
+                    with open(conll_path, "w", encoding="utf-8") as f_out:
+                        for example in ds:
+                            text = example["text"]  # Don't strip the text
+                            tokens = example["tokens"]
+                            ents = example["ents"]
+
+                            # Create token-level tags (default to 'O')
+                            tags = ["O"] * len(tokens)
+
+                            # Assign BIO tags based on entity positions
+                            for ent in ents:
+                                start_char = ent["start"]
+                                end_char = ent["end"]
+                                ent_label = ent["label"]
+
+                                # Find tokens that overlap with this entity
+                                for i, token in enumerate(tokens):
+                                    token_start = token["start"]
+                                    token_end = token["end"]
+
+                                    # If token overlaps with entity
+                                    if token_start >= start_char and token_end <= end_char:
+                                        # First token gets B- prefix
+                                        if token_start == start_char:
+                                            tags[i] = f"B-{ent_label}"
+                                        # Subsequent tokens get I- prefix
+                                        else:
+                                            tags[i] = f"I-{ent_label}"
+
+                            # Write tokens and tags
+                            for token, tag in zip(tokens, tags):
+                                token_text = text[token["start"] : token["end"]]  # Don't strip the token
+                                if token_text:  # Still skip empty tokens
+                                    # Replace newlines with space in output to maintain CoNLL format
+                                    token_text = token_text.replace("\n", " ")
+                                    f_out.write(f"{token_text}\t{tag}\n")
+
+                            # Empty line between sentences
+                            f_out.write("\n")
+
+                except Exception as e:
+                    print(f"Error downloading or converting dataset: {e}")
+                    raise
+
+        # Initialize corpus using the converted files
+        super().__init__(
+            base_path,
+            column_format={0: "text", 1: "ner"},
+            train_file="train.tsv",
+            test_file="test.tsv",
+            dev_file="validation.tsv",
+            column_delimiter="\t",
+            in_memory=in_memory,
+            **corpusargs,
+        )
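
For reference, a minimal usage sketch of the new corpus class (illustrative only, not part of the diff above). It assumes NER_DANISH_DANSK is exported from flair.datasets like the other NER corpora and that the chcaa/dansk-ner dataset is reachable on the HuggingFace Hub; make_label_dictionary is the standard flair Corpus API.

    # Usage sketch, assuming the class is exported from flair.datasets
    from flair.datasets import NER_DANISH_DANSK

    # First use downloads chcaa/dansk-ner and writes train/test/validation .tsv files
    corpus = NER_DANISH_DANSK()
    print(corpus)  # train/dev/test sentence counts

    # Build the NER tag dictionary via the standard flair Corpus API
    label_dict = corpus.make_label_dictionary(label_type="ner")
    print(label_dict)

Each converted split is a two-column TSV (token, tab, BIO tag, with a blank line between sentences), matching the column_format passed to ColumnCorpus above.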