doc: improve datastream and merge description

samedii · samedii · commit 57cb8f31fdba · 2020-12-01T10:39:00.000+01:00
diff --git a/datastream/dataset.py b/datastream/dataset.py
@@ -35,7 +35,7 @@ class Dataset(BaseModel, Generic[T]):
         ...         cost * 2,
         ...     ))
         ... )
-        >>> print(dataset[2])
+        >>> dataset[2]
         ('banana', 28)
     '''
 
@@ -51,10 +51,11 @@ class Config:
     def from_subscriptable(subscriptable) -> Dataset:
         '''
         Create ``Dataset`` based on subscriptable i.e. implements
-        ``__getitem__`` and ``__len__``. Should only be used for simple
-        examples as a ``Dataset`` created with this method does not support
-        methods that require a source dataframe (i.e. :func:`Dataset.split`
-        and :func:`Dataset.subset`)
+        ``__getitem__`` and ``__len__``.
+
+        Should only be used for simple examples, as a ``Dataset`` created
+        with this method does not support methods that require a source
+        dataframe, such as :func:`Dataset.split` and :func:`Dataset.subset`.
         '''
 
         return (
@@ -328,7 +329,6 @@ def group_split(
             ).items()
         }
 
-
     def with_columns(
         self: Dataset[T], **kwargs: Callable[pd.Dataframe, pd.Series]
     ) -> Dataset[T]:
@@ -405,8 +405,11 @@ def to_concat(dataset_index, inner_index):
     def concat(datasets: List[Dataset]) -> Dataset[R]:
         '''
         Concatenate multiple datasets together so that they behave like a
-        single dataset. Consider using :func:`Datastream.merge` if you have
-        multiple data sources.
+        single dataset.
+
+        Consider using :func:`Datastream.merge` instead if you have
+        multiple data sources, as it allows you to control the number
+        of samples from each source in the training batches.
         '''
         from_concat_mapping = Dataset.create_from_concat_mapping(datasets)
 
@@ -440,6 +443,7 @@ def from_combine(index):
     @staticmethod
     def create_to_combine_mapping(datasets):
         cumprod_lengths = np.cumprod(list(map(len, datasets)))
+
         def to_concat(inner_indices):
             return inner_indices[0] + sum([
                 inner_index * cumprod_lengths[i]
@@ -453,7 +457,7 @@ def combine(datasets: List[Dataset]) -> Dataset[Tuple]:
         Zip multiple datasets together so that all combinations of examples
         are possible (i.e. the product) creating tuples like
         ``(example1, example2, ...)``.
-    
+
         The created dataset will not have a dataframe because combined
         datasets are often very long and it is expensive to enumerate them.
         '''
diff --git a/datastream/datastream.py b/datastream/datastream.py
@@ -31,7 +31,9 @@
 class Datastream(BaseModel, Generic[T]):
     '''
     ``Datastream[T]`` combines a ``Dataset[T]`` and a sampler into a stream of
-    examples. By default the samples are drawn without replacement until the
+    examples.
+
+    By default the samples are drawn without replacement until the
     full dataset is exhausted. The proportion of the dataset that should be
     drawn before allowing replacement can be changed with
     :func:`Datastream.sample_proportion`.
@@ -70,7 +72,7 @@ def __init__(
 
     def __len__(self):
         return len(self.sampler)
-    
+
     def __iter__(self):
         return map(self.dataset.__getitem__, iter(self.sampler))
 
@@ -80,17 +82,22 @@ def merge(datastreams_and_ns: Tuple[Union[
         Tuple[Datastream[T], int]
     ], ...]) -> Datastream[T]:
         '''
-        Merge multiple datastreams by interleaving them. Optionally you can
-        define different lengths per ``Datastream``.
-
-        .. highlight:: python
-        .. code-block:: python
-
-            Datastream.merge([
-                (datastream1, 2),
-                (datastream2, 1),
-                (datastream3, 1),
-            ])
+        Creates a merged datastream where samples are drawn one at a time from
+        each underlying datastream (also known as "interleave").
+
+        Optionally you can define the number of drawn samples per
+        ``Datastream``.
+
+        >>> datastream1 = Datastream(Dataset.from_subscriptable([1, 1]))
+        >>> datastream2 = Datastream(Dataset.from_subscriptable([2, 2]))
+        >>> datastream3 = Datastream(Dataset.from_subscriptable([3, 3, 3, 3]))
+        >>> merged_datastream = Datastream.merge([
+        ...     (datastream1, 1),
+        ...     (datastream2, 1),
+        ...     (datastream3, 2),
+        ... ])
+        >>> list(merged_datastream)
+        [1, 2, 3, 3, 1, 2, 3, 3]
         '''
         datastreams_and_ns = [
             x if type(x) is tuple else (x, 1)