Skip to content

Text classification doccano #11

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 49 additions & 9 deletions doccano_transformer/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,19 @@
import json
from typing import Any, Callable, Iterable, Iterator, List, Optional, TextIO

from doccano_transformer.examples import Example, NERExample
from doccano_transformer.examples import (Example, NERExample,
TextClassificationExample)
from doccano_transformer.utils import read_labels


class Dataset:
def __init__(
self,
filepath: str,
encoding: Optional[str] = 'utf-8',
transformation_func: Optional[Callable[[TextIO], Iterable[Any]]] = None
self,
filepath: str,
encoding: Optional[str] = 'utf-8',
transformation_func: Optional[
Callable[[TextIO], Iterable[Any]]] = None
) -> None:

self.filepath = filepath
self.encoding = encoding
self.transformation_func = transformation_func or (lambda x: x)
Expand All @@ -29,7 +31,7 @@ def from_jsonl(

@classmethod
def from_csv(
cls, filepath: str, encoding: Optional[str] = 'utf-8'
cls, filepath: str, encoding: Optional[str] = 'utf-8'
) -> 'Dataset':
return cls(filepath, encoding, csv.DictReader)

Expand All @@ -48,13 +50,51 @@ class NERDataset(TaskDataset):
example_class = NERExample

def to_conll2003(
self, tokenizer: Callable[[str], List[str]]
self, tokenizer: Callable[[str], List[str]]
) -> Iterator[str]:
for example in self:
yield from example.to_conll2003(tokenizer)

def to_spacy(
self, tokenizer: Callable[[str], List[str]]
self, tokenizer: Callable[[str], List[str]]
) -> Iterator[dict]:
for example in self:
yield from example.to_spacy(tokenizer)


class TextClassificationDataset(TaskDataset):
    example_class = TextClassificationExample

    def __init__(
        self,
        filepath: str,
        labels_filepath: str,
        encoding: Optional[str] = 'utf-8',
        transformation_func: Optional[
            Callable[[TextIO], Iterable[Any]]] = None
    ) -> None:
        """Dataset for converting text classification annotations.

        Args:
            filepath: path to the exported annotations file.
            labels_filepath: path to the exported label metadata file.
            encoding: encoding of the annotation file.
            transformation_func: optional transformation applied to the
                raw file object (e.g. ``csv.DictReader``).
        """
        super().__init__(filepath, encoding, transformation_func)
        self.labels_filepath = labels_filepath
        # Mapping of label id -> label text, loaded once at construction.
        self.labels = read_labels(self.labels_filepath)

    @classmethod
    def from_jsonl(
        cls, filepath: str, encoding: Optional[str] = 'utf-8', **kwargs
    ) -> 'Dataset':
        """Create a dataset from a doccano JSONL export.

        Args:
            filepath: path to the exported annotations file.
            encoding: encoding of the annotation file.
            **kwargs: must contain ``labels_filepath``, the path to the
                exported label metadata file.

        Raises:
            ValueError: if ``labels_filepath`` is not supplied.
        """
        labels_filepath = kwargs.get('labels_filepath')
        if labels_filepath is None:
            # Fail fast with a clear message instead of the confusing
            # TypeError raised later when read_labels() tries open(None).
            raise ValueError(
                "from_jsonl requires a 'labels_filepath' keyword argument")
        return cls(
            filepath, labels_filepath, encoding,
            lambda f: map(json.loads, f))

    def __iter__(self) -> Iterator[Example]:
        # Wrap each raw record in the example class (attaching the label
        # mapping) and validate it before yielding.
        for raw in super().__iter__():
            example = self.example_class(raw, self.labels)
            example.is_valid(raise_exception=True)
            yield example

    def to_fasttext(self) -> Iterator[str]:
        """Yield one fastText-formatted training line per example."""
        for example in self:
            yield example.to_fasttext()
36 changes: 32 additions & 4 deletions doccano_transformer/examples.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from collections import defaultdict
from typing import Callable, Iterator, List, Optional
from typing import Callable, Dict, Iterator, List, Optional

from spacy.gold import biluo_tags_from_offsets

Expand Down Expand Up @@ -58,7 +59,7 @@ def is_valid(self, raise_exception: Optional[bool] = True) -> bool:
return True

def to_conll2003(
self, tokenizer: Callable[[str], List[str]]
self, tokenizer: Callable[[str], List[str]]
) -> Iterator[dict]:
all_tokens, all_token_offsets = self.get_tokens_and_token_offsets(
tokenizer)
Expand All @@ -79,7 +80,7 @@ def to_conll2003(
yield {'user': user, 'data': ''.join(lines)}

def to_spacy(
self, tokenizer: Callable[[str], List[str]]
self, tokenizer: Callable[[str], List[str]]
) -> Iterator[dict]:
all_tokens, all_token_offsets = self.get_tokens_and_token_offsets(
tokenizer)
Expand All @@ -101,11 +102,38 @@ def to_spacy(
tags = biluo_tags_from_offsets(tokens, label)
tokens_for_spacy = []
for i, (token, tag, offset) in enumerate(
zip(tokens, tags, offsets)
zip(tokens, tags, offsets)
):
tokens_for_spacy.append(
{'id': i, 'orth': str(token), 'ner': tag}
)
sentences.append({'tokens': tokens_for_spacy})
data['sentences'] = sentences
yield {'user': user, 'data': {'id': self.id, 'paragraphs': [data]}}


class TextClassificationExample(Example):

    def __init__(self, raw, labels: Dict) -> None:
        """Example class for text classification projects.

        Args:
            raw: a single exported annotation record in the form of a
                dict; must contain the keys ``'annotations'`` and
                ``'text'``.
            labels: mapping of labels from id to text.
        """
        self.raw = raw
        self.labels = labels
        self.annotations = self.raw['annotations']

    def is_valid(self, raise_exception: Optional[bool] = True) -> bool:
        # No structural validation is performed for text classification
        # examples; every record is reported as valid.
        return True

    def _append_label_text(self, label_id: int) -> str:
        """Format a single label id as a fastText label token."""
        return f'__label__{self.labels[label_id]} '

    def _create_label_tags(self) -> str:
        """Concatenate the fastText label tokens for all annotations."""
        return ''.join(self._append_label_text(annotation['label'])
                       for annotation in self.annotations)

    def to_fasttext(self) -> str:
        """Render the example as one fastText training line."""
        return self._create_label_tags() + self.raw['text'] + os.linesep
13 changes: 11 additions & 2 deletions doccano_transformer/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import TYPE_CHECKING, List, Optional, Tuple
import json
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple

if TYPE_CHECKING:
from doccano_transformer.datasets import Dataset
Expand Down Expand Up @@ -99,7 +100,7 @@ def __str__(self):


def convert_tokens_and_offsets_to_spacy_tokens(
tokens: List[str], offsets: List[int]
tokens: List[str], offsets: List[int]
) -> List[Token]:
"""Convert tokens and offsets to the list of SpaCy compatible object.

Expand All @@ -120,3 +121,11 @@ def convert_tokens_and_offsets_to_spacy_tokens(
for i, (token, offset) in enumerate(zip(tokens, offsets)):
spacy_tokens.append(Token(token, offset, i))
return spacy_tokens


def read_labels(labels_filepath: str) -> Dict:
    """Read a doccano label metadata export into an id -> text mapping.

    Args:
        labels_filepath: path to a JSON file containing a list of label
            objects, each with ``'id'`` and ``'text'`` keys.

    Returns:
        Dict mapping each label id to its label text.
    """
    # Use a context manager so the file handle is always closed, and an
    # explicit encoding so the result does not depend on the locale.
    with open(labels_filepath, mode='r', encoding='utf-8') as f:
        labels_doccano = json.load(f)
    return {label['id']: label['text'] for label in labels_doccano}