|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "metadata": {}, |
| 6 | + "source": [ |
| 7 | + "PCA uses a linear transformation to reduce the dimensions of our dataset. As a bonus, because PCA transforms the data by exploring ‘linear’ covariances between the variables, it can also be used as an anomaly detector: any recipe that doesn’t follow the ‘structure’ of the initial dataset won’t transform well.\n", |
| 8 | + "\n", |
| 9 | + "6714 ingredients -> 6714 columns. When an ingredient is present in a recipe, its column is set to 1; all the other columns stay at 0. On average, only 10 of those columns will be ‘active’ in each row. The following code creates the “transformer”: an encoder that takes an ingredient (string) as input and outputs its vector representation. The final vector containing all of a recipe’s ingredients will be the result of a ‘logical or’ over every one of those ingredient vectors." |
| 10 | + ] |
| 11 | + }, |
| 12 | + { |
| 13 | + "cell_type": "code", |
| 14 | + "execution_count": 1, |
| 15 | + "metadata": {}, |
| 16 | + "outputs": [], |
| 17 | + "source": [ |
| 18 | + "from sklearn.preprocessing import LabelEncoder\n", |
| 19 | + "from sklearn.preprocessing import OneHotEncoder\n", |
| 20 | + "from numpy import array\n", |
| 21 | + "import json\n", |
| 22 | + "with open('train.json', 'r') as f:  # context manager guarantees the file handle is closed\n", |
| 23 | + "    recipes_train_txt = f.read()\n", |
| 24 | + "recipes_train_json = json.loads(recipes_train_txt)\n", |
| 25 | + "# Collect every recipe's ingredient list, plus the set of distinct ingredient names.\n", |
| 26 | + "ingredients = set()\n", |
| 27 | + "ingredients_matrix = []\n", |
| 28 | + "for recipe in recipes_train_json:\n", |
| 29 | + "    ingredients_matrix.append(recipe[\"ingredients\"])\n", |
| 30 | + "    ingredients.update(recipe[\"ingredients\"])\n", |
| 31 | + "# Sorted so that a position in the encoded vector is easy to map back to an ingredient name.\n", |
| 32 | + "ingredients = sorted(ingredients)\n", |
| 33 | + "values = array(ingredients)\n", |
| 34 | + "\n", |
| 35 | + "# LabelEncoder assigns a unique int to each ingredient string and remembers the mapping,\n", |
| 36 | + "# which the one-hot encoder needs. Something like: ['banana'] -> [1]\n", |
| 37 | + "label_encoder = LabelEncoder()\n", |
| 38 | + "integer_encoded = label_encoder.fit_transform(values)\n", |
| 39 | + "\n", |
| 40 | + "# One-hot encode the integer codes; the encoder is fed a 2-D column with one code per row.\n", |
| 41 | + "onehot_encoder = OneHotEncoder(sparse=False)  # NOTE(review): scikit-learn >= 1.2 spells this sparse_output=False\n", |
| 42 | + "integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)\n", |
| 43 | + "onehot_encoded = onehot_encoder.fit_transform(integer_encoded)  # something like: [2] -> [0,0,1,0,...]\n", |
| 44 | + "def transform_value(s):\n", |
| 45 | + "    \"\"\"Return the one-hot vector for ingredient name `s`, using the fitted module-level encoders.\"\"\"\n", |
| 46 | + "    # Wrap the single string in an array before handing it to the label encoder.\n", |
| 47 | + "    codes = label_encoder.transform(array([s]))\n", |
| 48 | + "    # Reshape to a column (one code per row) before one-hot encoding.\n", |
| 49 | + "    code_column = codes.reshape(len(codes), 1)\n", |
| 50 | + "    rows = onehot_encoder.transform(code_column)\n", |
| 51 | + "    return rows[0]" |
| 52 | + ] |
| 53 | + }, |
| 54 | + { |
| 55 | + "cell_type": "code", |
| 56 | + "execution_count": 1, |
| 57 | + "metadata": {}, |
| 58 | + "outputs": [ |
| 59 | + { |
| 60 | + "name": "stdout", |
| 61 | + "output_type": "stream", |
| 62 | + "text": [ |
| 63 | + "Init Plugin\n", |
| 64 | + "Init Graph Optimizer\n", |
| 65 | + "Init Kernel\n" |
| 66 | + ] |
| 67 | + } |
| 68 | + ], |
| 69 | + "source": [ |
| 70 | + "import tensorflow as tf\n", |
| 71 | + "from tensorflow.keras.layers import BatchNormalization\n", |
| 72 | + "from tensorflow.keras.layers import LeakyReLU\n", |
| 73 | + "from tensorflow.keras.layers import Activation\n", |
| 74 | + "from tensorflow.keras.layers import Flatten\n", |
| 75 | + "from tensorflow.keras.layers import Dense\n", |
| 76 | + "from tensorflow.keras.layers import Reshape\n", |
| 77 | + "from tensorflow.keras.layers import Input\n", |
| 78 | + "from tensorflow.keras.models import Model\n", |
| 79 | + "from tensorflow.keras import backend as K\n", |
| 80 | + "import numpy as np" |
| 81 | + ] |
| 82 | + }, |
| 83 | + { |
| 84 | + "cell_type": "code", |
| 85 | + "execution_count": null, |
| 86 | + "metadata": {}, |
| 87 | + "outputs": [], |
| 88 | + "source": [ |
| 89 | + "# TF1-style graph code: under TF 2.x it goes through tf.compat.v1 and needs eager execution disabled first.\n", |
| 90 | + "tf.compat.v1.disable_eager_execution()\n", |
| 91 | + "# NOTE(review): X_Total is not defined in any cell of this notebook; presumably the recipe-by-ingredient matrix -- confirm.\n", |
| 92 | + "num_input = len(X_Total[0])\n", |
| 93 | + "num_hidden_l = 700\n", |
| 94 | + "learning_rate = 0.01  # NOTE(review): was undefined in the original cell; starting value only, tune as needed\n", |
| 95 | + "X = tf.compat.v1.placeholder(\"float\", [None, num_input])\n", |
| 96 | + "w_encoder_h1 = tf.Variable(tf.random.normal([num_input, num_hidden_l]))\n", |
| 97 | + "w_decoder_h2 = tf.Variable(tf.random.normal([num_hidden_l, num_input]))\n", |
| 98 | + "encoder_b1 = tf.Variable(tf.random.normal([num_hidden_l]))\n", |
| 99 | + "decoder_b2 = tf.Variable(tf.random.normal([num_input]))\n", |
| 100 | + "layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(X, w_encoder_h1), encoder_b1))\n", |
| 101 | + "layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, w_decoder_h2), decoder_b2))\n", |
| 102 | + "# Autoencoder: the prediction is the reconstruction, and the targets (labels) are the input data.\n", |
| 103 | + "y_pred = layer_2\n", |
| 104 | + "y_true = X\n", |
| 105 | + "loss = tf.reduce_mean(tf.pow(y_true - y_pred, 2))  # mean squared reconstruction error\n", |
| 106 | + "optimizer = tf.compat.v1.train.RMSPropOptimizer(learning_rate).minimize(loss)" |
| 107 | + ] |
| 108 | + } |
| 109 | + ], |
| 110 | + "metadata": { |
| 111 | + "interpreter": { |
| 112 | + "hash": "79f4630616981068147ecb693f55d51ab12fab43ffc02db62e4992b7ed83fc2b" |
| 113 | + }, |
| 114 | + "kernelspec": { |
| 115 | + "display_name": "Python 3.8.10 64-bit ('tf2.5': conda)", |
| 116 | + "name": "python3" |
| 117 | + }, |
| 118 | + "language_info": { |
| 119 | + "codemirror_mode": { |
| 120 | + "name": "ipython", |
| 121 | + "version": 3 |
| 122 | + }, |
| 123 | + "file_extension": ".py", |
| 124 | + "mimetype": "text/x-python", |
| 125 | + "name": "python", |
| 126 | + "nbconvert_exporter": "python", |
| 127 | + "pygments_lexer": "ipython3", |
| 128 | + "version": "3.8.10" |
| 129 | + }, |
| 130 | + "orig_nbformat": 4 |
| 131 | + }, |
| 132 | + "nbformat": 4, |
| 133 | + "nbformat_minor": 2 |
| 134 | +} |
0 commit comments