@@ -35,7 +35,7 @@ class Dataset(BaseModel, Generic[T]):
35
35
... cost * 2,
36
36
... ))
37
37
... )
38
- >>> print( dataset[2])
38
+ >>> dataset[2]
39
39
('banana', 28)
40
40
'''
41
41
@@ -51,10 +51,11 @@ class Config:
51
51
def from_subscriptable (subscriptable ) -> Dataset :
52
52
'''
53
53
Create ``Dataset`` based on a subscriptable, i.e. an object that implements
54
- ``__getitem__`` and ``__len__``. Should only be used for simple
55
- examples as a ``Dataset`` created with this method does not support
56
- methods that require a source dataframe (i.e. :func:`Dataset.split`
57
- and :func:`Dataset.subset`)
54
+ ``__getitem__`` and ``__len__``.
55
+
56
+ Should only be used for simple examples as a ``Dataset`` created with
57
+ this method does not support methods that require a source dataframe
58
+ like :func:`Dataset.split` and :func:`Dataset.subset`.
58
59
'''
59
60
60
61
return (
@@ -328,7 +329,6 @@ def group_split(
328
329
).items ()
329
330
}
330
331
331
-
332
332
def with_columns (
333
333
self : Dataset [T ], ** kwargs : Callable [pd .Dataframe , pd .Series ]
334
334
) -> Dataset [T ]:
@@ -405,8 +405,11 @@ def to_concat(dataset_index, inner_index):
405
405
def concat (datasets : List [Dataset ]) -> Dataset [R ]:
406
406
'''
407
407
Concatenate multiple datasets together so that they behave like a
408
- single dataset. Consider using :func:`Datastream.merge` if you have
409
- multiple data sources.
408
+ single dataset.
409
+
410
+ Consider using :func:`Datastream.merge` instead if you have
411
+ multiple data sources, as it allows you to control the number
412
+ of samples from each source in the training batches.
410
413
'''
411
414
from_concat_mapping = Dataset .create_from_concat_mapping (datasets )
412
415
@@ -440,6 +443,7 @@ def from_combine(index):
440
443
@staticmethod
441
444
def create_to_combine_mapping (datasets ):
442
445
cumprod_lengths = np .cumprod (list (map (len , datasets )))
446
+
443
447
def to_concat (inner_indices ):
444
448
return inner_indices [0 ] + sum ([
445
449
inner_index * cumprod_lengths [i ]
@@ -453,7 +457,7 @@ def combine(datasets: List[Dataset]) -> Dataset[Tuple]:
453
457
Zip multiple datasets together so that all combinations of examples
454
458
are possible (i.e. the product) creating tuples like
455
459
``(example1, example2, ...)``.
456
-
460
+
457
461
The created dataset will not have a dataframe because combined
458
462
datasets are often very long and it is expensive to enumerate them.
459
463
'''
0 commit comments