
Commit fd5d1ad

Merge pull request #3624 from flairNLP/fix_opus_corpus

fix opus corpus and fix ruff errors

2 parents: ab698dc + fde4a57

20 files changed: +203 -207 lines

flair/__init__.py (+5 -5)

@@ -85,16 +85,16 @@ def set_seed(seed: int):
 
 
 __all__ = [
+    "__version__",
     "cache_root",
+    "data",
+    "datasets",
     "device",
-    "__version__",
     "logger",
-    "set_seed",
-    "data",
     "models",
     "nn",
+    "set_proxies",
+    "set_seed",
     "trainers",
     "visual",
-    "datasets",
-    "set_proxies",
 ]
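
The reordering above appears to target ruff's RUF022 rule (unsorted `__all__`); the commit message only says "fix ruff errors", so the specific rule is an assumption. For this module a plain lexicographic sort reproduces the new order, since in ASCII the underscore sorts before lowercase letters and `"__version__"` therefore lands first. A minimal, illustrative check:

```python
# new_all is copied from the diff above; assumption: RUF022 is the rule
# being satisfied here.
new_all = [
    "__version__", "cache_root", "data", "datasets", "device", "logger",
    "models", "nn", "set_proxies", "set_seed", "trainers", "visual",
]
assert new_all == sorted(new_all)
```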

flair/class_utils.py (+2 -3)

@@ -2,8 +2,7 @@
 import inspect
 from collections.abc import Iterable
 from types import ModuleType
-from typing import Any, Iterable, List, Optional, Protocol, Type, TypeVar, Union, overload
-
+from typing import Any, Optional, Protocol, TypeVar, Union, overload
 
 T = TypeVar("T")
 
@@ -12,7 +11,7 @@ class StringLike(Protocol):
     def __str__(self) -> str: ...
 
 
-def get_non_abstract_subclasses(cls: Type[T]) -> Iterable[Type[T]]:
+def get_non_abstract_subclasses(cls: type[T]) -> Iterable[type[T]]:
     for subclass in cls.__subclasses__():
         yield from get_non_abstract_subclasses(subclass)
         if inspect.isabstract(subclass):
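
Two likely ruff findings are resolved here: `Iterable` was imported from both `collections.abc` and `typing` (a redefinition), and `typing.Type`/`typing.List` are deprecated aliases that pyupgrade-style rules (UP006/UP035) replace with builtin generics such as `type[T]` on Python 3.9+. A self-contained sketch of the helper with the new annotations; the body past the lines shown in the diff is my reconstruction, not the flair source verbatim:

```python
import inspect
from abc import ABC, abstractmethod
from collections.abc import Iterable
from typing import TypeVar

T = TypeVar("T")


def get_non_abstract_subclasses(cls: type[T]) -> Iterable[type[T]]:
    # Walk the subclass tree recursively, yielding only concrete classes.
    for subclass in cls.__subclasses__():
        yield from get_non_abstract_subclasses(subclass)
        if inspect.isabstract(subclass):
            continue
        yield subclass


class Base(ABC):
    @abstractmethod
    def run(self) -> None: ...


class Concrete(Base):
    def run(self) -> None: ...


print(list(get_non_abstract_subclasses(Base)))  # [<class '__main__.Concrete'>]
```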

flair/data.py (+4 -7)

@@ -1389,11 +1389,12 @@ def __init__(
         sample_missing_splits: Union[bool, str] = True,
         random_seed: Optional[int] = None,
     ) -> None:
-        """Constructor method to initialize a :class:`Corpus`. You can define the train, dev and test split
+        """Initialize a Corpus.
+
+        You can define the train, dev and test split
         by passing the corresponding Dataset object to the constructor. At least one split should be defined.
         If the option `sample_missing_splits` is set to True, missing splits will be randomly sampled from the
         train split.
-
         In most cases, you will not use the constructor yourself. Rather, you will create a corpus using one of our
         helper methods that read common NLP filetypes. For instance, you can use
         :class:`flair.datasets.sequence_labeling.ColumnCorpus` to read CoNLL-formatted files directly into
@@ -1679,11 +1680,7 @@ def _count_token_labels(sentences: Iterable[Sentence], label_type: str) -> defau
         return label_count
 
     def __str__(self) -> str:
-        return "Corpus: %d train + %d dev + %d test sentences" % (
-            _len_dataset(self.train) if self.train else 0,
-            _len_dataset(self.dev) if self.dev else 0,
-            _len_dataset(self.test) if self.test else 0,
-        )
+        return f"Corpus: {_len_dataset(self.train) if self.train else 0} train + {_len_dataset(self.dev) if self.dev else 0} dev + {_len_dataset(self.test) if self.test else 0} test sentences"
 
     def make_label_dictionary(
         self, label_type: str, min_count: int = -1, add_unk: bool = False, add_dev_test: bool = False
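
Two changes here: the constructor docstring gains the one-line summary plus blank line that pydocstyle-style rules (ruff's D group) expect, and `__str__` trades %-formatting for an f-string, the pattern flagged by pyupgrade's printf-string rule. For the integer counts involved, the two forms render identically; an illustrative check with made-up numbers:

```python
# Hypothetical counts; both formats yield the same summary line.
n_train, n_dev, n_test = 100, 10, 20
old = "Corpus: %d train + %d dev + %d test sentences" % (n_train, n_dev, n_test)
new = f"Corpus: {n_train} train + {n_dev} dev + {n_test} test sentences"
assert old == new  # "Corpus: 100 train + 10 dev + 20 test sentences"
```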

flair/datasets/__init__.py (+103 -103)

@@ -324,19 +324,11 @@
 )
 
 __all__ = [
-    "DataLoader",
-    "OcrJsonDataset",
-    "SROIE",
-    "FlairDatapointDataset",
-    "SentenceDataset",
-    "MongoDataset",
-    "StringDataset",
-    "EntityLinkingDictionary",
     "AGNEWS",
+    "AMAZON_REVIEWS",
     "ANAT_EM",
     "AZDZ",
     "BC2GM",
-    "BIO_INFER",
     "BIOBERT_CHEMICAL_BC4CHEMD",
     "BIOBERT_CHEMICAL_BC5CDR",
     "BIOBERT_DISEASE_BC5CDR",
@@ -347,19 +339,41 @@
     "BIOBERT_SPECIES_S800",
     "BIONLP2013_CG",
     "BIONLP2013_PC",
+    "BIOSCOPE",
     "BIOSEMANTICS",
+    "BIO_INFER",
     "CDR",
     "CELL_FINDER",
     "CEMP",
     "CHEMDNER",
+    "CLEANCONLL",
     "CLL",
+    "COMMUNICATIVE_FUNCTIONS",
+    "CONLL_03",
+    "CONLL_03_DUTCH",
+    "CONLL_03_GERMAN",
+    "CONLL_03_SPANISH",
+    "CONLL_2000",
     "CRAFT",
     "CRAFT_V4",
+    "CTD_CHEMICALS_DICTIONARY",
+    "CTD_DISEASES_DICTIONARY",
     "DECA",
+    "FEWNERD",
     "FSU",
     "GELLUS",
+    "GERMEVAL_2018_OFFENSIVE_LANGUAGE",
+    "GLUE_COLA",
+    "GLUE_MNLI",
+    "GLUE_MRPC",
+    "GLUE_QNLI",
+    "GLUE_QQP",
+    "GLUE_RTE",
+    "GLUE_SST2",
+    "GLUE_STSB",
+    "GLUE_WNLI",
+    "GO_EMOTIONS",
     "GPRO",
-    "HunerEntityLinkingDictionary",
     "HUNER_CELL_LINE",
     "HUNER_CELL_LINE_CELL_FINDER",
     "HUNER_CELL_LINE_CLL",
@@ -404,77 +418,24 @@
     "HUNER_SPECIES_S800",
     "HUNER_SPECIES_VARIOME",
     "IEPA",
+    "IMDB",
     "JNLPBA",
+    "KEYPHRASE_INSPEC",
+    "KEYPHRASE_SEMEVAL2010",
+    "KEYPHRASE_SEMEVAL2017",
     "LINNEAUS",
     "LOCTEXT",
+    "MASAKHA_POS",
     "MIRNA",
+    "NCBI_DISEASE",
     "NCBI_GENE_HUMAN_DICTIONARY",
     "NCBI_TAXONOMY_DICTIONARY",
-    "CTD_DISEASES_DICTIONARY",
-    "CTD_CHEMICALS_DICTIONARY",
-    "NCBI_DISEASE",
-    "ONTONOTES",
-    "OSIRIS",
-    "PDR",
-    "S800",
-    "SCAI_CHEMICALS",
-    "SCAI_DISEASE",
-    "VARIOME",
-    "AMAZON_REVIEWS",
-    "COMMUNICATIVE_FUNCTIONS",
-    "GERMEVAL_2018_OFFENSIVE_LANGUAGE",
-    "GLUE_COLA",
-    "GO_EMOTIONS",
-    "IMDB",
-    "NEWSGROUPS",
-    "STACKOVERFLOW",
-    "SENTEVAL_CR",
-    "SENTEVAL_MPQA",
-    "SENTEVAL_MR",
-    "SENTEVAL_SST_BINARY",
-    "SENTEVAL_SST_GRANULAR",
-    "SENTEVAL_SUBJ",
-    "SENTIMENT_140",
-    "TREC_6",
-    "TREC_50",
-    "WASSA_ANGER",
-    "WASSA_FEAR",
-    "WASSA_JOY",
-    "WASSA_SADNESS",
-    "YAHOO_ANSWERS",
-    "ClassificationCorpus",
-    "ClassificationDataset",
-    "CSVClassificationCorpus",
-    "CSVClassificationDataset",
     "NEL_ENGLISH_AIDA",
     "NEL_ENGLISH_AQUAINT",
     "NEL_ENGLISH_IITB",
     "NEL_ENGLISH_REDDIT",
     "NEL_ENGLISH_TWEEKI",
     "NEL_GERMAN_HIPE",
-    "WSD_MASC",
-    "WSD_OMSTI",
-    "WSD_RAGANATO_ALL",
-    "WSD_SEMCOR",
-    "WSD_TRAINOMATIC",
-    "WSD_UFSAC",
-    "WSD_WORDNET_GLOSS_TAGGED",
-    "RE_ENGLISH_CONLL04",
-    "RE_ENGLISH_DRUGPROT",
-    "RE_ENGLISH_SEMEVAL2010",
-    "RE_ENGLISH_TACRED",
-    "BIOSCOPE",
-    "CONLL_03",
-    "CONLL_03_DUTCH",
-    "CONLL_03_GERMAN",
-    "CONLL_03_SPANISH",
-    "CLEANCONLL",
-    "CONLL_2000",
-    "FEWNERD",
-    "KEYPHRASE_INSPEC",
-    "KEYPHRASE_SEMEVAL2010",
-    "KEYPHRASE_SEMEVAL2017",
-    "MASAKHA_POS",
     "NER_ARABIC_ANER",
     "NER_ARABIC_AQMAR",
     "NER_BASQUE",
@@ -491,6 +452,7 @@
     "NER_ENGLISH_WEBPAGES",
     "NER_ENGLISH_WIKIGOLD",
     "NER_ENGLISH_WNUT_2020",
+    "NER_ESTONIAN_NOISY",
     "NER_FINNISH",
     "NER_GERMAN_BIOFID",
     "NER_GERMAN_EUROPARL",
@@ -499,51 +461,44 @@
     "NER_GERMAN_MOBIE",
     "NER_GERMAN_POLITICS",
     "NER_HIPE_2022",
-    "NER_NOISEBENCH",
     "NER_HUNGARIAN",
     "NER_ICDAR_EUROPEANA",
     "NER_ICELANDIC",
     "NER_JAPANESE",
-    "NER_NERMUD",
     "NER_MASAKHANE",
+    "NER_MULTI_CONER",
+    "NER_MULTI_CONER_V2",
     "NER_MULTI_WIKIANN",
     "NER_MULTI_WIKINER",
     "NER_MULTI_XTREME",
+    "NER_NERMUD",
+    "NER_NOISEBENCH",
     "NER_SWEDISH",
     "NER_TURKU",
     "NER_UKRAINIAN",
-    "NER_ESTONIAN_NOISY",
-    "UP_CHINESE",
-    "UP_ENGLISH",
-    "UP_FINNISH",
-    "UP_FRENCH",
-    "UP_GERMAN",
-    "UP_ITALIAN",
-    "UP_SPANISH",
-    "UP_SPANISH_ANCORA",
-    "WNUT_17",
-    "ColumnCorpus",
-    "ColumnDataset",
-    "NER_MULTI_CONER",
-    "NER_MULTI_CONER_V2",
-    "FeideggerCorpus",
-    "FeideggerDataset",
-    "GLUE_MNLI",
-    "GLUE_MRPC",
-    "GLUE_QNLI",
-    "GLUE_QQP",
-    "GLUE_RTE",
-    "GLUE_WNLI",
-    "GLUE_SST2",
-    "GLUE_STSB",
+    "NEWSGROUPS",
+    "ONTONOTES",
+    "OSIRIS",
+    "PDR",
+    "RE_ENGLISH_CONLL04",
+    "RE_ENGLISH_DRUGPROT",
+    "RE_ENGLISH_SEMEVAL2010",
+    "RE_ENGLISH_TACRED",
+    "S800",
+    "SCAI_CHEMICALS",
+    "SCAI_DISEASE",
+    "SENTEVAL_CR",
+    "SENTEVAL_MPQA",
+    "SENTEVAL_MR",
+    "SENTEVAL_SST_BINARY",
+    "SENTEVAL_SST_GRANULAR",
+    "SENTEVAL_SUBJ",
+    "SENTIMENT_140",
+    "SROIE",
+    "STACKOVERFLOW",
     "SUPERGLUE_RTE",
-    "DataPairCorpus",
-    "DataPairDataset",
-    "DataTripleCorpus",
-    "DataTripleDataset",
-    "OpusParallelCorpus",
-    "ParallelTextCorpus",
-    "ParallelTextDataset",
+    "TREC_6",
+    "TREC_50",
     "UD_AFRIKAANS",
     "UD_ANCIENT_GREEK",
     "UD_ARABIC",
@@ -603,7 +558,52 @@
     "UD_TURKISH",
     "UD_UKRAINIAN",
     "UD_WOLOF",
+    "UP_CHINESE",
+    "UP_ENGLISH",
+    "UP_FINNISH",
+    "UP_FRENCH",
+    "UP_GERMAN",
+    "UP_ITALIAN",
+    "UP_SPANISH",
+    "UP_SPANISH_ANCORA",
+    "VARIOME",
+    "WASSA_ANGER",
+    "WASSA_FEAR",
+    "WASSA_JOY",
+    "WASSA_SADNESS",
+    "WNUT_17",
+    "WSD_MASC",
+    "WSD_OMSTI",
+    "WSD_RAGANATO_ALL",
+    "WSD_SEMCOR",
+    "WSD_TRAINOMATIC",
+    "WSD_UFSAC",
+    "WSD_WORDNET_GLOSS_TAGGED",
+    "YAHOO_ANSWERS",
+    "ZELDA",
+    "CSVClassificationCorpus",
+    "CSVClassificationDataset",
+    "ClassificationCorpus",
+    "ClassificationDataset",
+    "ColumnCorpus",
+    "ColumnDataset",
+    "DataLoader",
+    "DataPairCorpus",
+    "DataPairDataset",
+    "DataTripleCorpus",
+    "DataTripleDataset",
+    "EntityLinkingDictionary",
+    "FeideggerCorpus",
+    "FeideggerDataset",
+    "FlairDatapointDataset",
+    "HunerEntityLinkingDictionary",
+    "MongoDataset",
+    "OcrJsonDataset",
+    "OpusParallelCorpus",
+    "ParallelTextCorpus",
+    "ParallelTextDataset",
+    "SentenceDataset",
+    "StringDataset",
     "UniversalDependenciesCorpus",
     "UniversalDependenciesDataset",
-    "ZELDA",
 ]
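
This large reshuffle is again an `__all__` sort, but the order is not plain `sorted()`: uppercase constant-style names precede CamelCase class names, and digit runs compare numerically, which is why `"TREC_6"` comes before `"TREC_50"` and `"CONLL_03"` before `"CONLL_2000"`. As far as I can tell this matches ruff's RUF022 "isort-style" natural sort. A rough approximation of the grouping, not ruff's exact algorithm:

```python
# Approximation only: constants first, CamelCase second, alphabetical within
# each block. Ruff additionally compares digit runs numerically (natural
# sort), which this simple key does not attempt.
def rough_key(name: str) -> tuple[int, str]:
    return (0 if name.isupper() else 1, name)


sample = ["ZELDA", "ColumnCorpus", "AGNEWS", "DataLoader", "UD_ARABIC"]
print(sorted(sample, key=rough_key))
# ['AGNEWS', 'UD_ARABIC', 'ZELDA', 'ColumnCorpus', 'DataLoader']
```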

flair/datasets/biomedical.py (+2 -4)

@@ -1104,7 +1104,7 @@ def parse_file(cls, input_file: Path, split: str, sentence_separator: str) -> In
                 document_text += sentence_separator
 
             sentence_offset = len(document_text)
-            document_text += sentence.get("text") if document_text else sentence.get("text")
+            document_text += sentence.get("text")
 
             for entity in sentence.xpath(".//entity"):
                 start, end = entity.get("charOffset").split("-")
@@ -4146,7 +4146,7 @@ def download_corpus(download_folder: Path) -> tuple[Path, Path, Path]:
 
 @deprecated(
     version="0.13",
-    reason='Please use BIGBIO_NER_CORPUS implementation by calling ´corpus = BIGBIO_NER_CORPUS("bigbio/anat_em", trust_remote_code=True)´',
+    reason='Please use BIGBIO_NER_CORPUS implementation by calling `corpus = BIGBIO_NER_CORPUS("bigbio/anat_em", trust_remote_code=True)`',
 )
 class ANAT_EM(ColumnCorpus):
     """Corpus for anatomical named entity mention recognition.
@@ -4157,8 +4157,6 @@ class ANAT_EM(ColumnCorpus):
     http://nactem.ac.uk/anatomytagger/#AnatEM
     """
 
-    pass
-
 
 class BioBertHelper(ColumnCorpus):
     """Helper class to convert corpora and the respective train, dev and test split used by BioBERT.