use gzip for mnist

rasbt · rasbt · commit d0a5dc1cc566 · 2017-07-29T23:13:07.000-04:00
diff --git a/.gitignore b/.gitignore
@@ -2,6 +2,11 @@ docs/equations/*.aux
 docs/equations/*.log
 docs/equations/*.out
 docs/equations/*.synctex.gz
+code/ch12/mnist
+code/datasets/mnist/t10k-images-idx3-ubyte
+code/datasets/mnist/t10k-labels-idx1-ubyte
+code/datasets/mnist/train-images-idx3-ubyte
+code/datasets/mnist/train-labels-idx1-ubyte
 
 .ipynb_checkpoints
 .DS_Store
diff --git a/code/ch12/ch12.ipynb b/code/ch12/ch12.ipynb
@@ -42,14 +42,14 @@
      "output_type": "stream",
      "text": [
       "Sebastian Raschka \n",
-      "last updated: 2016-09-29 \n",
+      "last updated: 2017-07-29 \n",
       "\n",
-      "CPython 3.5.2\n",
-      "IPython 5.1.0\n",
+      "CPython 3.6.1\n",
+      "IPython 6.0.0\n",
       "\n",
-      "numpy 1.11.1\n",
-      "scipy 0.18.1\n",
-      "matplotlib 1.5.1\n"
+      "numpy 1.13.1\n",
+      "scipy 0.19.1\n",
+      "matplotlib 2.0.2\n"
      ]
     }
    ],
@@ -108,8 +108,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "from IPython.display import Image\n",
@@ -139,7 +141,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -149,7 +151,7 @@
        "<IPython.core.display.Image object>"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 3,
      "metadata": {
       "image/png": {
        "width": 600
@@ -179,7 +181,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -189,7 +191,7 @@
        "<IPython.core.display.Image object>"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 4,
      "metadata": {
       "image/png": {
        "width": 400
@@ -204,7 +206,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -214,7 +216,7 @@
        "<IPython.core.display.Image object>"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 5,
      "metadata": {
       "image/png": {
        "width": 500
@@ -244,7 +246,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -254,7 +256,7 @@
        "<IPython.core.display.Image object>"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 6,
      "metadata": {
       "image/png": {
        "width": 500
@@ -316,7 +318,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
    "metadata": {
     "collapsed": true
    },
@@ -348,6 +350,50 @@
     "    return images, labels"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Important Note**\n",
+    "\n",
+    "Some readers experienced issues with the `load_mnist` function above because certain decompression tools renamed the files from *-labels-idx1-ubyte* to *-labels.idx1-ubyte*. To avoid this problem altogether, the modified function below loads the dataset directly from the `gz` archives using Python's `gzip` module."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import struct\n",
+    "import numpy as np\n",
+    "import gzip\n",
+    " \n",
+    "def load_mnist(path, kind='train'):\n",
+    "    \"\"\"Load MNIST data from `path`\"\"\"\n",
+    "    labels_path = os.path.join(path, \n",
+    "                               '%s-labels-idx1-ubyte.gz' % kind)\n",
+    "    images_path = os.path.join(path, \n",
+    "                               '%s-images-idx3-ubyte.gz' % kind)\n",
+    "        \n",
+    "    with gzip.open(labels_path, 'rb') as lbpath:\n",
+    "        lbpath.read(8)\n",
+    "        buffer = lbpath.read()\n",
+    "        labels = np.frombuffer(buffer, dtype=np.uint8)\n",
+    "\n",
+    "    with gzip.open(images_path, 'rb') as imgpath:\n",
+    "        imgpath.read(16)\n",
+    "        buffer = imgpath.read()\n",
+    "        images = np.frombuffer(buffer, \n",
+    "                               dtype=np.uint8).reshape(\n",
+    "            len(labels), 784).astype(np.float64)\n",
+    " \n",
+    "    return images, labels"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 9,
@@ -476,7 +522,9 @@
   {
    "cell_type": "code",
    "execution_count": 13,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "# np.savetxt('train_img.csv', X_train, fmt='%i', delimiter=',')\n",
@@ -510,7 +558,9 @@
   {
    "cell_type": "code",
    "execution_count": 8,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "import numpy as np\n",
@@ -921,7 +971,9 @@
   {
    "cell_type": "code",
    "execution_count": 15,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "nn = NeuralNetMLP(n_output=10, \n",
@@ -996,7 +1048,9 @@
   {
    "cell_type": "code",
    "execution_count": 18,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "batches = np.array_split(range(len(nn.cost_)), 1000)\n",
@@ -1753,7 +1807,9 @@
   {
    "cell_type": "code",
    "execution_count": 29,
-   "metadata": {},
+   "metadata": {
+    "collapsed": true
+   },
    "outputs": [],
    "source": [
     "nn_check = MLPGradientCheck(n_output=10, \n",
@@ -2011,7 +2067,7 @@
  "metadata": {
   "anaconda-cloud": {},
   "kernelspec": {
-   "display_name": "Python [default]",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -2025,7 +2081,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.5.2"
+   "version": "3.6.1"
   }
  },
  "nbformat": 4,