Merge pull request #7 from joachimwolff/dev
Dev
joachimwolff authored Feb 24, 2020
2 parents 017a067 + 028ea19 commit 9ed5d0f
Showing 34 changed files with 381 additions and 217 deletions.
48 changes: 9 additions & 39 deletions setup.py
@@ -1,8 +1,10 @@
#! /usr/bin/python
# Copyright 2016 Joachim Wolff
# For license see file LICENSE
# Copyright 2016, 2017, 2018, 2019, 2020 Joachim Wolff
# PhD Thesis
#
# Copyright 2015, 2016 Joachim Wolff
# Master Thesis
# Tutors: Fabrizio Costa, Milad Miladi
# Tutor: Fabrizio Costa
# Winter semester 2015/2016
#
# Chair of Bioinformatics
@@ -26,11 +28,11 @@
import time
__author__ = "Joachim Wolff"
__contact__ = "[email protected]"
__copyright__ = "Copyright 2019, Joachim Wolff"
__copyright__ = "Copyright 2020, Joachim Wolff"
__credits__ = ["Milad Miladi", "Fabrizio Costa"]
__license__ = "MIT"
__date__ = time.strftime("%d/%m/%Y")
__version__ = "0.4"
__version__ = "0.5"

from setuptools import setup, find_packages
import platform
@@ -125,11 +127,6 @@ def locate_cuda():
CUDA = locate_cuda()


# Obtain the numpy include directory. This logic works across numpy versions.
# try:
# numpy_include = numpy.get_include()
# except AttributeError:
# numpy_include = numpy.get_numpy_include()
def customize_compiler_gcc(self):
"""inject deep into distutils to customize how the dispatch
to gcc/nvcc works.
@@ -151,13 +148,6 @@ def customize_compiler_gcc(self):
# object but distutils doesn't have the ability to change compilers
# based on source extension: we add it.
def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
# if os.path.splitext(src)[1] == '.cu':
# # use the cuda for .cu files
# self.set_executable('compiler_so', CUDA['nvcc'])
# # use only a subset of the extra_postargs, which are 1-1 translated
# # from the extra_compile_args in the Extension class
# postargs = extra_postargs['nvcc']
# else:
postargs = extra_postargs['gcc']

super(obj, src, ext, cc_args, postargs, pp_opts)
@@ -168,15 +158,6 @@ def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
self._compile = _compile


# run the customize_compiler
# class custom_build_ext_gcc(build_ext):
# def build_extensions(self):
# customize_compiler_for_gcc(self.compiler)
# build_ext.build_extensions(self)





def customize_compiler_for_nvcc(self):
"""inject deep into distutils to customize how the dispatch
@@ -245,11 +226,9 @@ def build_extensions(self):
"scipy >= 1.3.0",
"scikit-learn >= 0.21.0",],
ext_modules = [module1],
# cmdclass={'build_ext': custom_build_ext_gcc},
packages=['sparse_neighbors_search',
'sparse_neighbors_search.neighbors',
'sparse_neighbors_search.cluster',
# 'bioinf.computation',
],
platforms = "Linux",
version = __version__
@@ -258,9 +237,6 @@ def build_extensions(self):
print ("CUDA found on system. Installing MinHash with CUDA-Support.")
sources_list.extend(['sparse_neighbors_search/computation/kernel.cu', 'sparse_neighbors_search/computation/inverseIndexCuda.cu', 'sparse_neighbors_search/computation/nearestNeighborsCuda.cu'])
depends_list.extend(['sparse_neighbors_search/computation/typeDefinitionsCuda.h', 'sparse_neighbors_search/computation/kernel.h', 'sparse_neighbors_search/computation/inverseIndexCuda.h', 'sparse_neighbors_search/computation/nearestNeighborsCuda.h', ])
# Extension('_nearestNeighbors', sources = sources_list, depends = depends_list,
# define_macros=[('OPENMP', None)], extra_link_args = ["-lm", "-lrt","-lgomp"],
# extra_compile_args=["-fopenmp", "-O3", "-std=c++11"])
if openmp:
ext = Extension('_nearestNeighbors',
sources = sources_list, depends = depends_list,
@@ -272,12 +248,10 @@
# we're only going to use certain compiler args with nvcc and not with gcc
# the implementation of this trick is in customize_compiler() below
define_macros=[('OPENMP', None), ('CUDA', None)],
# extra_link_args={'gcc': ["-lm", "-lrt","-lgomp"],
# 'nvcc' :[] },
extra_link_args=["-lm", "-lrt","-lgomp"],
extra_compile_args={'gcc': ["-fopenmp", "-O3", "-std=c++11", "-funroll-loops", "-msse4.1"],
'nvcc': ['-arch=sm_60', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'", '-std=c++11' ]},
include_dirs = [CUDA['include'], 'src'],#, '/home/joachim/Software/cub-1.5.1'],
include_dirs = [CUDA['include'], 'src'],
platforms = "Linux, Mac OS X"
)
else:
@@ -291,17 +265,14 @@ def build_extensions(self):
# we're only going to use certain compiler args with nvcc and not with gcc
# the implementation of this trick is in customize_compiler() below
define_macros=[('CUDA', None)],
# extra_link_args={'gcc': ["-lm", "-lrt","-lgomp"],
# 'nvcc' :[] },
extra_link_args=["-lm", "-lrt","-lgomp"],
extra_compile_args={'gcc': ["-O3", "-std=c++11", "-msse4.1"],
'nvcc': ['-arch=sm_60', '--ptxas-options=-v', '-c', '--compiler-options', "'-fPIC'", '-std=c++11' ]},
include_dirs = [CUDA['include'], 'src'],#, '/home/joachim/Software/cub-1.5.1'],
include_dirs = [CUDA['include'], 'src'],
platforms = "Linux, Mac OS X"
)

setup(name='sparse_neighbors_search',
# random metadata. there's more you can supply
author='Joachim Wolff',
ext_modules = [ext],

@@ -319,7 +290,6 @@ def build_extensions(self):
"numpy >= 1.17.0",
"scipy >= 1.3.0",
"scikit-learn >= 0.21.0",],
# ext_modules = [module1],
packages=['sparse_neighbors_search',
'sparse_neighbors_search.neighbors',
'sparse_neighbors_search.cluster',
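The comments retained in the hunks above ("we're only going to use certain compiler args with nvcc and not with gcc ... the implementation of this trick is in customize_compiler() below") describe the nvcc dispatch trick: extra_compile_args is passed as a dict keyed by 'gcc' and 'nvcc', and a patched _compile picks the flag set per source extension. A minimal sketch of the pattern, with a simplified signature and body that are assumptions rather than the exact setup.py implementation:

import os

def customize_compiler_for_nvcc(compiler, nvcc_path):
    """Patch a distutils compiler object so .cu sources are built with nvcc."""
    compiler.src_extensions.append('.cu')
    default_compiler_so = compiler.compiler_so
    default_compile = compiler._compile

    def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
        if os.path.splitext(src)[1] == '.cu':
            # route .cu files through nvcc and use the 'nvcc' flag subset
            compiler.set_executable('compiler_so', nvcc_path)
            postargs = extra_postargs['nvcc']
        else:
            postargs = extra_postargs['gcc']
        default_compile(obj, src, ext, cc_args, postargs, pp_opts)
        # restore the default host compiler for the next source file
        compiler.compiler_so = default_compiler_so

    compiler._compile = _compile

The same mechanism explains the deleted customize_compiler_gcc variant above: with the CUDA branch commented out, only the 'gcc' entry of the dict was ever used.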
11 changes: 7 additions & 4 deletions sparse_neighbors_search/__init__.py
@@ -1,7 +1,10 @@
# Copyright 2015 Joachim Wolff
# Master Project
# Tutors: Milad Miladi, Fabrizio Costa
# Summer semester 2015
# Copyright 2016, 2017, 2018, 2019, 2020 Joachim Wolff
# PhD Thesis
#
# Copyright 2015, 2016 Joachim Wolff
# Master Thesis
# Tutor: Fabrizio Costa
# Winter semester 2015/2016
#
# Chair of Bioinformatics
# Department of Computer Science
38 changes: 28 additions & 10 deletions sparse_neighbors_search/cluster/minHashClustering.py
@@ -1,6 +1,9 @@
# Copyright 2015 Joachim Wolff
# Copyright 2016, 2017, 2018, 2019, 2020 Joachim Wolff
# PhD Thesis
#
# Copyright 2015, 2016 Joachim Wolff
# Master Thesis
# Tutors: Milad Miladi, Fabrizio Costa
# Tutor: Fabrizio Costa
# Winter semester 2015/2016
#
# Chair of Bioinformatics
@@ -10,20 +13,35 @@

from ..neighbors import MinHash
import numpy as np
from scipy.sparse import vstack

class MinHashClustering():
def __init__(self, minHashObject, clusteringObject):
self._minHashObject = minHashObject
self._clusteringObject = clusteringObject

def fit(self, X, y=None):
self._minHashObject.fit(X)
precomputed_graph = self._minHashObject.kneighbors_graph(mode='distance')
self._clusteringObject.fit(precomputed_graph)
self._precomputed_graph = None
def fit(self, X, y=None, pSaveMemory=None):
if pSaveMemory is not None and pSaveMemory > 0:
if pSaveMemory > 1:
pSaveMemory = 1
number_of_elements = X.shape[0]
batch_size = int(np.floor(number_of_elements * pSaveMemory))
if batch_size < 1:
batch_size = 1
self._minHashObject.fit(X[0:batch_size, :])
if batch_size < number_of_elements:
for i in range(batch_size, X.shape[0], batch_size):
self._minHashObject.partial_fit(X[i:i+batch_size, :])
else:
self._minHashObject.fit(X)
self._precomputed_graph = self._minHashObject.kneighbors_graph(mode='distance')
self._clusteringObject.fit(self._precomputed_graph)

def fit_predict(self, X, y=None):
self.fit(X, y)
return self.predict(X, y)
def fit_predict(self, X, y=None, pSaveMemory=None):
self.fit(X, y, pSaveMemory=pSaveMemory)
return self.predict(self._precomputed_graph, y)

def predict(self, X, y=None):
if hasattr(self._clusteringObject, 'labels_'):
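The pSaveMemory branch added to fit() above implements a simple memory-saving scheme: build the MinHash index on the first fraction of the rows, then stream the remaining rows through partial_fit in equally sized batches. A standalone sketch of the pattern (the helper name batched_fit is hypothetical; pSaveMemory is assumed to be in (0, 1]):

import numpy as np

def batched_fit(estimator, X, pSaveMemory):
    # Clamp the fraction to at most 1, as the diff does for pSaveMemory > 1.
    fraction = min(pSaveMemory, 1)
    # Guarantee at least one row per batch (the diff's batch_size < 1 guard).
    batch_size = max(int(np.floor(X.shape[0] * fraction)), 1)
    # Build the index on the first batch, then extend it incrementally.
    estimator.fit(X[0:batch_size, :])
    for i in range(batch_size, X.shape[0], batch_size):
        estimator.partial_fit(X[i:i + batch_size, :])

With a fraction of 1 the loop never runs and the scheme reduces to a single fit call, which is exactly what the else branch does.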
34 changes: 27 additions & 7 deletions sparse_neighbors_search/cluster/minHashDBSCAN.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
# Copyright 2015 Joachim Wolff
# Copyright 2016, 2017, 2018, 2019, 2020 Joachim Wolff
# PhD Thesis
#
# Copyright 2015, 2016 Joachim Wolff
# Master Thesis
# Tutor: Fabrizio Costa
# Winter semester 2015/2016
Expand Down Expand Up @@ -43,8 +46,9 @@ def __init__(self, eps=0.5, min_samples=5,
self._dbscan = DBSCAN(eps=self.eps, min_samples=min_samples, metric='precomputed',
algorithm=self.algorithm, leaf_size=self.leaf_size, p=self.p)
self.labels_ = None
self._precomputed_graph = None
# only for compatible issues
def fit(self, X, y=None):
def fit(self, X, y=None, pSaveMemory=None):
minHashNeighbors = MinHash(n_neighbors = self.n_neighbors,
radius = self.radius, fast = self.fast,
number_of_hash_functions = self.number_of_hash_functions,
@@ -54,10 +58,26 @@ def __init__(self, eps=0.5, min_samples=5,
excess_factor = self.excess_factor,
number_of_cores = self.number_of_cores,
chunk_size = self.chunk_size, similarity=False)
minHashNeighbors.fit(X, y)
graph_result = minHashNeighbors.kneighbors_graph(mode='distance')
self._dbscan.fit(graph_result)

if pSaveMemory is not None and pSaveMemory > 0:
if pSaveMemory > 1:
pSaveMemory = 1
number_of_elements = X.shape[0]
batch_size = int(np.floor(number_of_elements * pSaveMemory))
if batch_size < 1:
batch_size = 1
minHashNeighbors.fit(X[0:batch_size, :])
if batch_size < number_of_elements:
for i in range(batch_size, X.shape[0], batch_size):
minHashNeighbors.partial_fit(X[i:i+batch_size, :])
else:
minHashNeighbors.fit(X)


# minHashNeighbors.fit(X, y)
self._precomputed_graph = minHashNeighbors.kneighbors_graph(mode='distance')
self._dbscan.fit(self._precomputed_graph)
self.labels_ = self._dbscan.labels_
def fit_predict(self, X, y=None):
self.fit(X, y)
def fit_predict(self, X, y=None, pSaveMemory=None):
self.fit(X, y, pSaveMemory=pSaveMemory)
return self.labels_
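A hypothetical usage of the new parameter (the import path and class name are inferred from the module name and are assumptions; adjust to how the cluster package actually exposes the class), fitting the index in batches of roughly a quarter of the rows:

from scipy.sparse import random as sparse_random
from sparse_neighbors_search.cluster.minHashDBSCAN import MinHashDBSCAN  # path assumed

X = sparse_random(1000, 5000, density=0.01, format='csr')  # sparse example data
clusterer = MinHashDBSCAN(eps=0.5, min_samples=5)
clusterer.fit(X, pSaveMemory=0.25)  # one fit on 250 rows, then three partial_fit batches
print(clusterer.labels_)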
27 changes: 22 additions & 5 deletions sparse_neighbors_search/cluster/minHashSpectralClustering.py
@@ -1,6 +1,9 @@
# Copyright 2015 Joachim Wolff
# Copyright 2016, 2017, 2018, 2019, 2020 Joachim Wolff
# PhD Thesis
#
# Copyright 2015, 2016 Joachim Wolff
# Master Thesis
# Tutors: Milad Miladi, Fabrizio Costa
# Tutor: Fabrizio Costa
# Winter semester 2015/2016
#
# Chair of Bioinformatics
@@ -58,7 +61,7 @@ def __init__(self, n_clusters=8, eigen_solver=None,
kernel_params = self.kernel_params)
# kept only for compatibility
self.labels_ = None
def fit(self, X, y=None):
def fit(self, X, y=None, pSaveMemory=None):
minHashNeighbors = MinHash(n_neighbors = self.n_neighbors,
radius = self.radius, fast = self.fast,
number_of_hash_functions = self.number_of_hash_functions,
@@ -68,11 +71,25 @@ def fit(self, X, y=None):
excess_factor = self.excess_factor,
number_of_cores = self.number_of_cores,
chunk_size = self.chunk_size, similarity=True)
minHashNeighbors.fit(X, y)

if pSaveMemory is not None and pSaveMemory > 0:
if pSaveMemory > 1:
pSaveMemory = 1
number_of_elements = X.shape[0]
batch_size = int(np.floor(number_of_elements * pSaveMemory))
if batch_size < 1:
batch_size = 1
minHashNeighbors.fit(X[0:batch_size, :])
if batch_size < number_of_elements:
for i in range(batch_size, X.shape[0], batch_size):
minHashNeighbors.partial_fit(X[i:i+batch_size, :])
else:
minHashNeighbors.fit(X)

graph_result = minHashNeighbors.kneighbors_graph(mode='distance')
self._spectralClustering.fit(graph_result)
self.labels_ = self._spectralClustering.labels_
def fit_predict(self, X, y=None):
def fit_predict(self, X, y=None, pSaveMemory=None):
self.fit(X, y, pSaveMemory=pSaveMemory)

return self._spectralClustering.labels_
15 changes: 15 additions & 0 deletions sparse_neighbors_search/computation/hash.h
@@ -1,3 +1,18 @@
/**
Copyright 2016, 2017, 2018, 2019, 2020 Joachim Wolff
PhD Thesis
Copyright 2015, 2016 Joachim Wolff
Master Thesis
Tutor: Fabrizio Costa
Winter semester 2015/2016
Chair of Bioinformatics
Department of Computer Science
Faculty of Engineering
Albert-Ludwigs-University Freiburg im Breisgau
**/

#include "typeDefinitions.h"
#include "sseExtension.h"

