Add optional `groups` argument to the cross-sectional z-score value holder (and to the CSMeanAdjusted API wrapper)

wegamekinglc · wegamekinglc · commit 887b66dc448e · 2018-04-29T16:33:39.000+08:00
diff --git a/PyFin/Analysis/CrossSectionValueHolders.pyx b/PyFin/Analysis/CrossSectionValueHolders.pyx
@@ -227,46 +227,46 @@ cdef class CSAverageAdjustedSecurityValueHolder(CrossSectionValueHolder):
 
 
 cdef class CSZScoreSecurityValueHolder(CrossSectionValueHolder):
-    def __init__(self, innerValue):
-        super(CSZScoreSecurityValueHolder, self).__init__(innerValue)
+    def __init__(self, innerValue, groups=None):  # groups: optional holder supplying group labels
+        super(CSZScoreSecurityValueHolder, self).__init__(innerValue, groups)
 
-    @property
-    def value(self):
+    cdef _cal_impl(self):  # fill self.cached with the (optionally grouped) z-scores and mark it updated
+        cdef SeriesValues raw_values = self._inner.value
 
-        cdef SeriesValues raw_values
+        if self._group:  # assumes the base __init__ stores ``groups`` as self._group -- TODO confirm
+            self.cached = raw_values.zscore(self._group.value)
+        else:
+            self.cached = raw_values.zscore()
+        self.updated = 1
 
+    @property
+    def value(self):
         if self.updated:
             return self.cached
         else:
-            raw_values = self._inner.value
-            self.cached = raw_values.zscore()
-            self.updated = 1
+            self._cal_impl()  # shared with value_by_name so both paths populate the same cache
             return self.cached
 
-    @cython.cdivision(True)
     cpdef double value_by_name(self, name):
-
-        cdef SeriesValues raw_values
-
         if self.updated:
             return self.cached[name]
         else:
-            raw_values = self._inner.value
-            self.cached = raw_values.zscore()
-            self.updated = 1
+            self._cal_impl()
             return self.cached[name]
 
-    @cython.cdivision(True)
     cpdef SeriesValues value_by_names(self, list names):
-
-        cdef SeriesValues raw_values
-
-        raw_values = self._inner.value_by_names(names)
-        raw_values = raw_values.zscore()
-        return raw_values[names]
+        cdef SeriesValues raw_values = self._inner.value_by_names(names)  # bypasses self.cached
+        if self._group:
+            raw_values = raw_values.zscore(self._group.value_by_names(names))
+        else:
+            raw_values = raw_values.zscore()
+        return raw_values
 
     def __str__(self):
-        return "\mathrm{{CSZScore}}({0})".format(str(self._inner))
+        if self._group:  # raw strings below: "\m" is not a recognised escape sequence
+            return r"\mathrm{{CSZscore}}({0}, groups={1})".format(str(self._inner), str(self._group))
+        else:
+            return r"\mathrm{{CSZscore}}({0})".format(str(self._inner))
 
 
 cdef class CSResidueSecurityValueHolder(SecurityValueHolder):
diff --git a/PyFin/Analysis/SeriesValues.pxd b/PyFin/Analysis/SeriesValues.pxd
@@ -16,7 +16,7 @@ cdef class SeriesValues(object):
     cpdef SeriesValues mask(self, flags)
     cpdef list index(self)
     cpdef SeriesValues rank(self, SeriesValues groups=*)
-    cpdef SeriesValues zscore(self)
+    cpdef SeriesValues zscore(self, SeriesValues groups=*)
     cpdef SeriesValues unit(self)
 
     cpdef SeriesValues mean(self, SeriesValues groups=*)
diff --git a/PyFin/Analysis/SeriesValues.pyx b/PyFin/Analysis/SeriesValues.pyx
@@ -234,9 +234,30 @@ cdef class SeriesValues(object):
             data[np.isnan(self.values)] = NAN
         return SeriesValues(data, self.name_mapping)
 
-    cpdef SeriesValues zscore(self):
-        cdef np.ndarray[double, ndim=1] data = self.values
-        return SeriesValues((data - nanmean(data)) / nanstd(data), self.name_mapping)
+    cpdef SeriesValues zscore(self, SeriesValues groups=None):
+        """Z-score the values, per group when ``groups`` is given; NaN inputs stay NaN."""
+        cdef np.ndarray[double, ndim=1] data
+        cdef np.ndarray[long long, ndim=1] order
+        cdef np.ndarray[long long, ndim=1] index_diff
+        cdef long long diff_loc
+        cdef long long start = 0
+        cdef np.ndarray[long long, ndim=1] curr_idx
+        cdef np.ndarray[double, ndim=1] values = self.values
+        cdef np.ndarray[double, ndim=1] curr_values
+
+        if groups:
+            data = values.copy()
+            # presumably index_diff holds each group's last position in ``order`` -- inferred from the slicing below
+            index_diff, order = groupby(groups.values)
+            for diff_loc in index_diff:
+                curr_idx = order[start:diff_loc + 1]
+                curr_values = values[curr_idx]
+                data[curr_idx] = (curr_values - nanmean(curr_values)) / nanstd(curr_values)
+                start = diff_loc + 1
+        else:
+            data = (values - nanmean(values)) / nanstd(values)
+        data[np.isnan(values)] = NAN
+        return SeriesValues(data, self.name_mapping)
 
     cpdef SeriesValues unit(self):
         cdef np.ndarray[double, ndim=1] data = self.values
diff --git a/PyFin/api/Analysis.py b/PyFin/api/Analysis.py
@@ -58,16 +58,16 @@ def CSMean(dependency, groups=None):
     return CSAverageSecurityValueHolder(dependency, groups)
 
 
-def CSMeanAdjusted(dependency):
-    return CSAverageAdjustedSecurityValueHolder(dependency)
+def CSMeanAdjusted(dependency, groups=None):  # groups (optional) is passed straight through to the holder
+    return CSAverageAdjustedSecurityValueHolder(dependency, groups)
 
 
 def CSQuantiles(dependency, groups=None):
     return CSPercentileSecurityValueHolder(dependency, groups)
 
 
-def CSZScore(dependency):
-    return CSZScoreSecurityValueHolder(dependency)
+def CSZScore(dependency, groups=None):  # groups (optional) is passed straight through to the holder
+    return CSZScoreSecurityValueHolder(dependency, groups)
 
 
 def CSRes(left, right):
diff --git a/PyFin/tests/Analysis/testCrossSectionValueHolders.py b/PyFin/tests/Analysis/testCrossSectionValueHolders.py
@@ -234,6 +234,32 @@ def testCSZscoreSecurityValueHolder(self):
 
         np.testing.assert_array_almost_equal(expected, calculated.values)
 
+    def testCSZscoreSecurityValueHolderWithGroups(self):
+        benchmark = SecurityLatestValueHolder(x='close')
+        groups = SecurityLatestValueHolder(x='ind')
+        meanAdjustedHolder = CSZScoreSecurityValueHolder(benchmark, groups)
+
+        for i in range(len(self.datas['aapl']['close'])):
+            data = {'aapl': {Factors.CLOSE: self.datas['aapl'][Factors.CLOSE][i],
+                             Factors.OPEN: self.datas['aapl'][Factors.OPEN][i],
+                             'ind': 1.},
+                    'ibm': {Factors.CLOSE: self.datas['ibm'][Factors.CLOSE][i],
+                            Factors.OPEN: self.datas['ibm'][Factors.OPEN][i],
+                            'ind': 1.},
+                    'goog': {Factors.CLOSE: self.datas['goog'][Factors.CLOSE][i],
+                             Factors.OPEN: self.datas['goog'][Factors.OPEN][i],
+                             'ind': 2.},
+                    'baba': {Factors.CLOSE: self.datas['baba'][Factors.CLOSE][i],
+                             Factors.OPEN: self.datas['baba'][Factors.OPEN][i],
+                             'ind': 2.}}
+            benchmark.push(data)
+            meanAdjustedHolder.push(data)
+            benchmarkValues = benchmark.value
+            groups = {'aapl': 1., 'ibm': 1., 'goog': 2., 'baba': 2.}
+            expected_rank = pd.Series(benchmarkValues.to_dict()).groupby(groups) \
+                .transform(lambda x: (x - x.mean()) / x.std(ddof=0))
+            np.testing.assert_array_almost_equal(expected_rank, meanAdjustedHolder.value.values)
+
     def testCSZResidueSecurityValueHolder(self):
         y = SecurityLatestValueHolder(x='close')
         x = SecurityLatestValueHolder(x='open')