Skip to content

Text classification doccano #11

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 49 additions & 9 deletions doccano_transformer/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,19 @@
import json
from typing import Any, Callable, Iterable, Iterator, List, Optional, TextIO

from doccano_transformer.examples import Example, NERExample
from doccano_transformer.examples import (Example, NERExample,
TextClassificationExample)
from doccano_transformer.utils import read_labels


class Dataset:
def __init__(
self,
filepath: str,
encoding: Optional[str] = 'utf-8',
transformation_func: Optional[Callable[[TextIO], Iterable[Any]]] = None
self,
filepath: str,
encoding: Optional[str] = 'utf-8',
transformation_func: Optional[
Callable[[TextIO], Iterable[Any]]] = None
) -> None:

self.filepath = filepath
self.encoding = encoding
self.transformation_func = transformation_func or (lambda x: x)
Expand All @@ -29,7 +31,7 @@ def from_jsonl(

@classmethod
def from_csv(
cls, filepath: str, encoding: Optional[str] = 'utf-8'
cls, filepath: str, encoding: Optional[str] = 'utf-8'
) -> 'Dataset':
return cls(filepath, encoding, csv.DictReader)

Expand All @@ -48,13 +50,51 @@ class NERDataset(TaskDataset):
example_class = NERExample

def to_conll2003(
self, tokenizer: Callable[[str], List[str]]
self, tokenizer: Callable[[str], List[str]]
) -> Iterator[str]:
for example in self:
yield from example.to_conll2003(tokenizer)

def to_spacy(
self, tokenizer: Callable[[str], List[str]]
self, tokenizer: Callable[[str], List[str]]
) -> Iterator[dict]:
for example in self:
yield from example.to_spacy(tokenizer)


class TextClassificationDataset(TaskDataset):
    example_class = TextClassificationExample

    def __init__(
        self,
        filepath: str,
        labels_filepath: str,
        encoding: Optional[str] = 'utf-8',
        transformation_func: Optional[
            Callable[[TextIO], Iterable[Any]]] = None
    ) -> None:
        """Dataset for converting text classification annotations.

        Args:
            filepath: path to the exported annotations file.
            labels_filepath: path to the exported label metadata file.
            encoding: encoding of the annotation file.
            transformation_func: optional transformation applied to the
                raw file object (e.g. ``csv.DictReader``).
        """
        super().__init__(filepath, encoding, transformation_func)
        self.labels_filepath = labels_filepath
        # Mapping of label id -> label text, loaded once at construction.
        self.labels = read_labels(self.labels_filepath)

    @classmethod
    def from_jsonl(
        cls, filepath: str, encoding: Optional[str] = 'utf-8', **kwargs
    ) -> 'Dataset':
        """Create a dataset from a doccano JSONL export.

        Args:
            filepath: path to the exported annotations file.
            encoding: encoding of the annotation file.
            **kwargs: must contain ``labels_filepath``, the path to the
                exported label metadata file.

        Raises:
            ValueError: if ``labels_filepath`` is not supplied.
        """
        labels_filepath = kwargs.get('labels_filepath')
        if labels_filepath is None:
            # Fail fast with a clear message instead of the confusing
            # TypeError raised later when read_labels() tries open(None).
            raise ValueError(
                "from_jsonl requires a 'labels_filepath' keyword argument")
        return cls(
            filepath, labels_filepath, encoding,
            lambda f: map(json.loads, f))

    def __iter__(self) -> Iterator[Example]:
        # Wrap each raw record in the example class (attaching the label
        # mapping) and validate it before yielding.
        for raw in super().__iter__():
            example = self.example_class(raw, self.labels)
            example.is_valid(raise_exception=True)
            yield example

    def to_fasttext(self) -> Iterator[str]:
        """Yield one fastText-formatted training line per example."""
        for example in self:
            yield example.to_fasttext()
36 changes: 32 additions & 4 deletions doccano_transformer/examples.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from collections import defaultdict
from typing import Callable, Iterator, List, Optional
from typing import Callable, Dict, Iterator, List, Optional

from spacy.gold import biluo_tags_from_offsets

Expand Down Expand Up @@ -58,7 +59,7 @@ def is_valid(self, raise_exception: Optional[bool] = True) -> bool:
return True

def to_conll2003(
self, tokenizer: Callable[[str], List[str]]
self, tokenizer: Callable[[str], List[str]]
) -> Iterator[dict]:
all_tokens, all_token_offsets = self.get_tokens_and_token_offsets(
tokenizer)
Expand All @@ -79,7 +80,7 @@ def to_conll2003(
yield {'user': user, 'data': ''.join(lines)}

def to_spacy(
self, tokenizer: Callable[[str], List[str]]
self, tokenizer: Callable[[str], List[str]]
) -> Iterator[dict]:
all_tokens, all_token_offsets = self.get_tokens_and_token_offsets(
tokenizer)
Expand All @@ -101,11 +102,38 @@ def to_spacy(
tags = biluo_tags_from_offsets(tokens, label)
tokens_for_spacy = []
for i, (token, tag, offset) in enumerate(
zip(tokens, tags, offsets)
zip(tokens, tags, offsets)
):
tokens_for_spacy.append(
{'id': i, 'orth': str(token), 'ner': tag}
)
sentences.append({'tokens': tokens_for_spacy})
data['sentences'] = sentences
yield {'user': user, 'data': {'id': self.id, 'paragraphs': [data]}}


class TextClassificationExample(Example):

    def __init__(self, raw, labels: Dict) -> None:
        """Example class for text classification projects.

        Args:
            raw: a single exported annotation record in the form of a
                dict; must contain the keys ``'annotations'`` and
                ``'text'``.
            labels: mapping of labels from id to text.
        """
        self.raw = raw
        self.labels = labels
        self.annotations = self.raw['annotations']

    def is_valid(self, raise_exception: Optional[bool] = True) -> bool:
        # No structural validation is performed for text classification
        # examples; every record is reported as valid.
        return True

    def _append_label_text(self, label_id: int) -> str:
        """Format a single label id as a fastText label token."""
        return f'__label__{self.labels[label_id]} '

    def _create_label_tags(self) -> str:
        """Concatenate the fastText label tokens for all annotations."""
        return ''.join(self._append_label_text(annotation['label'])
                       for annotation in self.annotations)

    def to_fasttext(self) -> str:
        """Render the example as one fastText training line."""
        return self._create_label_tags() + self.raw['text'] + os.linesep
13 changes: 11 additions & 2 deletions doccano_transformer/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import TYPE_CHECKING, List, Optional, Tuple
import json
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple

if TYPE_CHECKING:
from doccano_transformer.datasets import Dataset
Expand Down Expand Up @@ -99,7 +100,7 @@ def __str__(self):


def convert_tokens_and_offsets_to_spacy_tokens(
tokens: List[str], offsets: List[int]
tokens: List[str], offsets: List[int]
) -> List[Token]:
"""Convert tokens and offsets to the list of SpaCy compatible object.

Expand All @@ -120,3 +121,11 @@ def convert_tokens_and_offsets_to_spacy_tokens(
for i, (token, offset) in enumerate(zip(tokens, offsets)):
spacy_tokens.append(Token(token, offset, i))
return spacy_tokens


def read_labels(labels_filepath: str) -> Dict:
    """Read a doccano label metadata export into an id -> text mapping.

    Args:
        labels_filepath: path to a JSON file containing a list of label
            objects, each with ``'id'`` and ``'text'`` keys.

    Returns:
        Dict mapping each label id to its label text.
    """
    # Use a context manager so the file handle is always closed, and an
    # explicit encoding so the result does not depend on the locale.
    with open(labels_filepath, mode='r', encoding='utf-8') as f:
        labels_doccano = json.load(f)
    return {label['id']: label['text'] for label in labels_doccano}