1
1
from __future__ import absolute_import
2
2
3
+ import sys
4
+
3
5
from .proxy import _ItemsResourceProxy , _DownloadableProxyMixin
4
6
5
7
@@ -37,6 +39,34 @@ class Items(_DownloadableProxyMixin, _ItemsResourceProxy):
37
39
'size': 100000,
38
40
}]
39
41
42
+ - retrieve items via a generator of lists. This is most useful in cases
43
+ where the job has a huge amount of items and it needs to be broken down
44
+ into chunks when consumed. This example shows a job with 3 items::
45
+
46
+ >>> gen = job.items.list_iter(chunksize=2)
47
+ >>> next(gen)
48
+ [{'name': 'Item #1'}, {'name': 'Item #2'}]
49
+ >>> next(gen)
50
+ [{'name': 'Item #3'}]
51
+ >>> next(gen)
52
+ Traceback (most recent call last):
53
+ File "<stdin>", line 1, in <module>
54
+ StopIteration
55
+
56
+ - retrieving via :meth:`list_iter` also supports the `start` and `count`
57
+ params. This is useful when you want to only retrieve a subset of items in
58
+ a job. The example below belongs to a job with 10 items::
59
+
60
+ >>> gen = job.items.list_iter(chunksize=2, start=5, count=3)
61
+ >>> next(gen)
62
+ [{'name': 'Item #5'}, {'name': 'Item #6'}]
63
+ >>> next(gen)
64
+ [{'name': 'Item #7'}]
65
+ >>> next(gen)
66
+ Traceback (most recent call last):
67
+ File "<stdin>", line 1, in <module>
68
+ StopIteration
69
+
40
70
- retrieve 1 item with multiple filters::
41
71
42
72
>>> filters = [("size", ">", [30000]), ("size", "<", [40000])]
@@ -59,3 +89,43 @@ def _modify_iter_params(self, params):
59
89
if offset :
60
90
params ['start' ] = '{}/{}' .format (self .key , offset )
61
91
return params
92
+
93
def list_iter(self, chunksize=1000, *args, **kwargs):
    """An alternative interface for reading items by returning them
    as a generator which yields lists of items sized as `chunksize`.

    This is a convenient method for cases when processing a large amount of
    items from a job isn't ideal in one go due to the large memory needed.
    Instead, this allows you to process it chunk by chunk.

    You can improve I/O overheads by increasing the chunk value but that
    would also increase the memory consumption.

    :param chunksize: size of list to be returned per iteration
    :param start: offset to specify the start of the item iteration
    :param count: overall number of items to be returned, which is broken
        down by `chunksize`.

    :return: an iterator over items, yielding lists of items.
    :rtype: :class:`collections.abc.Iterable`
    """
    start = kwargs.pop("start", 0)
    count = kwargs.pop("count", sys.maxsize)
    processed = 0

    # Loop only while the caller's `count` budget is not exhausted; this
    # also makes count=0 yield nothing instead of issuing a request with
    # a zero-sized page.
    while processed < count:
        # Shrink the final page so we never fetch past `count`.
        if processed + chunksize > count:
            chunksize = count - processed
        # The API expects the cursor as "<job key>/<item offset>".
        next_key = "{}/{}".format(self.key, start)
        items = list(self.iter(count=chunksize, start=next_key, *args, **kwargs))
        # An empty page means the job has no more items.  Stop here rather
        # than yielding a trailing empty list, which the previous version
        # did whenever the item total was an exact multiple of `chunksize`.
        if not items:
            break
        yield items
        processed += len(items)
        start += len(items)
        # A short page means the backend is exhausted; no need for one
        # more round-trip just to receive an empty response.
        if len(items) < chunksize:
            break
0 commit comments