diff --git a/baking_project-2.ipynb b/baking_project-2.ipynb deleted file mode 100644 index dd734e5..0000000 --- a/baking_project-2.ipynb +++ /dev/null @@ -1,3983 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "YzYu-XRFJrhk" - }, - "source": [ - "### Dependencies \n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "m7v7WNo4dNqw" - }, - "outputs": [], - "source": [ - "import os\n", - "import math\n", - "import numpy as np\n", - "import pandas as pd\n", - "import tensorflow as tf\n", - "import matplotlib.pyplot as plt\n", - "import json\n", - "import platform\n", - "import time\n", - "import pathlib\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "UAqSrdVLhyDp", - "outputId": "e6c5278d-9329-4a68-fe1c-f4b12542c55a" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Python version: 3.7.12\n", - "Tensorflow version: 2.7.0\n", - "Keras version: 2.7.0\n" - ] - } - ], - "source": [ - "print('Python version:', platform.python_version())\n", - "print('Tensorflow version:', tf.__version__)\n", - "print('Keras version:', tf.keras.__version__)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AHnTupAX1I-a" - }, - "source": [ - "### Uploading the initial data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ohSCowjg2PBI" - }, - "source": [ - "We will load the data from Google Drive; the original data source is https://www.kaggle.com/shuyangli94/food-com-recipes-and-user-interactions" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "8Zu_JHw5iTof", - "outputId": "07cbbe2c-28a1-4db2-93bc-afc78e202651" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Drive already mounted at /content/drive; to attempt to forcibly remount, 
call drive.mount(\"/content/drive\", force_remount=True).\n" - ] - } - ], - "source": [ - "from google.colab import drive\n", - "drive.mount('/content/drive')" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "x_63QbWisXFb" - }, - "outputs": [], - "source": [ - "path = '/content/drive/MyDrive/data/RAW_recipes.csv'\n", - "data = pd.read_csv(path)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 613 - }, - "id": "bjR8EHDMuhSN", - "outputId": "c6f51539-445b-44b3-f17b-cd0140da8d3c" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameidminutescontributor_idsubmittedtagsnutritionn_stepsstepsdescriptioningredientsn_ingredients
0arriba baked winter squash mexican style13773955478922005-09-16['60-minutes-or-less', 'time-to-make', 'course...[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]11['make a choice and proceed with recipe', 'dep...autumn is my favorite time of year to cook! th...['winter squash', 'mexican seasoning', 'mixed ...7
1a bit different breakfast pizza3149030262782002-06-17['30-minutes-or-less', 'time-to-make', 'course...[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]9['preheat oven to 425 degrees f', 'press dough...this recipe calls for the crust to be prebaked...['prepared pizza crust', 'sausage patty', 'egg...6
2all in the kitchen chili1121401301965862005-02-25['time-to-make', 'course', 'preparation', 'mai...[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]6['brown ground beef in large pot', 'add choppe...this modified version of 'mom's' chili was a h...['ground beef', 'yellow onions', 'diced tomato...13
3alouette potatoes5938945685852003-04-14['60-minutes-or-less', 'time-to-make', 'course...[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]11['place potatoes in a large pot of lightly sal...this is a super easy, great tasting, make ahea...['spreadable cheese with garlic and herbs', 'n...11
4amish tomato ketchup for canning44061190417062002-10-25['weeknight', 'time-to-make', 'course', 'main-...[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]5['mix all ingredients& boil for 2 1 / 2 hours ...my dh's amish mother raised him on this recipe...['tomato juice', 'apple cider vinegar', 'sugar...8
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ], - "text/plain": [ - " name ... n_ingredients\n", - "0 arriba baked winter squash mexican style ... 7\n", - "1 a bit different breakfast pizza ... 6\n", - "2 all in the kitchen chili ... 13\n", - "3 alouette potatoes ... 11\n", - "4 amish tomato ketchup for canning ... 8\n", - "\n", - "[5 rows x 12 columns]" - ] - }, - "metadata": {}, - "execution_count": 5 - } - ], - "source": [ - "data.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "VXfsMJRKu3L1", - "outputId": "f86ca2ca-23ef-4294-ec80-50a5fb7a0db8" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "(231637, 12)" - ] - }, - "metadata": {}, - "execution_count": 6 - } - ], - "source": [ - "data.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "o6grNAOj7RkF", - "outputId": "db169565-987b-4734-e182-768553f7b360" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "name object\n", - "id int64\n", - "minutes int64\n", - "contributor_id int64\n", - "submitted object\n", - "tags object\n", - "nutrition object\n", - "n_steps int64\n", - "steps object\n", - "description object\n", - "ingredients object\n", - "n_ingredients int64\n", - "dtype: object\n" - ] - } - ], - "source": [ - "print(data.dtypes)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "YQ6KkwnW0_7k" - }, - "source": [ - "### Preprocessing the dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RQhMzFx11VrS" - }, - "source": [ - "We will only need the recipe rows containing 'cake', 'cookie', or 'bread' in their names. We will only use the 'name', 'description', 'ingredients', and 'steps' columns." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "RixWkB4577J1" - }, - "outputs": [], - "source": [ - "#delete missing value function \n", - "def MissingValues (data):\n", - " if(data.isnull().values.any()): \n", - " columns = data.columns\n", - " for column in columns: \n", - " data[data[column].isnull()] = \"\"\n", - " data[data[column]=='NaN'] = \"\"\n", - " data[pd.isna(data[column])] = \"\"\n", - " return data\n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "_giBR4CC8Tzb" - }, - "outputs": [], - "source": [ - "data = MissingValues(data)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "10CxNT9R68uV", - "outputId": "58615983-e69a-4e4b-fcda-0fc1d1873eea" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "14950" - ] - }, - "metadata": {}, - "execution_count": 10 - } - ], - "source": [ - "#determine rows we need to remove \n", - "remove1 = data.loc[data.name.map(lambda x: len(x)<4 )] #name of the recipe is too short \n", - "remove2 = data.loc[data.ingredients.map(lambda x: len(x)<2 )] #recipe has less then 2 ingredients\n", - "remove3 = data.loc[data.steps.map(lambda x: len(x)<2 )] #recipe has less then 2 steps\n", - "\n", - "len(remove1) + len(remove2) +len(remove3)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "id": "Iw35lDrBSkFq" - }, - "outputs": [], - "source": [ - "data.drop(data[data.name.map(lambda x: len(x)<4 )].index, inplace=True)\n", - "data.drop(data[data.ingredients.map(lambda x: len(x)<2 )].index, inplace=True)\n", - "data.drop(data[data.steps.map(lambda x: len(x)<2 )].index, inplace=True)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "Lk5OXdRJX5Bp", - "outputId": 
"2d00503c-d568-432c-fed1-27d0e7c94253" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
nameidminutescontributor_idsubmittedtagsnutritionn_stepsstepsdescriptioningredientsn_ingredients
0arriba baked winter squash mexican style13773955478922005-09-16['60-minutes-or-less', 'time-to-make', 'course...[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]11['make a choice and proceed with recipe', 'dep...autumn is my favorite time of year to cook! th...['winter squash', 'mexican seasoning', 'mixed ...7
1a bit different breakfast pizza3149030262782002-06-17['30-minutes-or-less', 'time-to-make', 'course...[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]9['preheat oven to 425 degrees f', 'press dough...this recipe calls for the crust to be prebaked...['prepared pizza crust', 'sausage patty', 'egg...6
2all in the kitchen chili1121401301965862005-02-25['time-to-make', 'course', 'preparation', 'mai...[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]6['brown ground beef in large pot', 'add choppe...this modified version of 'mom's' chili was a h...['ground beef', 'yellow onions', 'diced tomato...13
3alouette potatoes5938945685852003-04-14['60-minutes-or-less', 'time-to-make', 'course...[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]11['place potatoes in a large pot of lightly sal...this is a super easy, great tasting, make ahea...['spreadable cheese with garlic and herbs', 'n...11
4amish tomato ketchup for canning44061190417062002-10-25['weeknight', 'time-to-make', 'course', 'main-...[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]5['mix all ingredients& boil for 2 1 / 2 hours ...my dh's amish mother raised him on this recipe...['tomato juice', 'apple cider vinegar', 'sugar...8
.......................................
231632zydeco soup486161602279782012-08-29['ham', '60-minutes-or-less', 'time-to-make', ...[415.2, 26.0, 34.0, 26.0, 44.0, 21.0, 15.0]7['heat oil in a 4-quart dutch oven', 'add cele...this is a delicious soup that i originally fou...['celery', 'onion', 'green sweet pepper', 'gar...22
231633zydeco spice mix493372515006782013-01-09['15-minutes-or-less', 'time-to-make', 'course...[14.8, 0.0, 2.0, 58.0, 1.0, 0.0, 1.0]1['mix all ingredients together thoroughly']this spice mix will make your taste buds dance!['paprika', 'salt', 'garlic powder', 'onion po...13
231634zydeco ya ya deviled eggs30808040377792008-06-07['60-minutes-or-less', 'time-to-make', 'course...[59.2, 6.0, 2.0, 3.0, 6.0, 5.0, 0.0]7['in a bowl , combine the mashed yolks and may...deviled eggs, cajun-style['hard-cooked eggs', 'mayonnaise', 'dijon must...8
231635cookies by design cookies on a stick298512295068222008-04-15['30-minutes-or-less', 'time-to-make', 'course...[188.0, 11.0, 57.0, 11.0, 7.0, 21.0, 9.0]9['place melted butter in a large mixing bowl a...i've heard of the 'cookies by design' company,...['butter', 'eagle brand condensed milk', 'ligh...10
231636cookies by design sugar shortbread cookies298509205068222008-04-15['30-minutes-or-less', 'time-to-make', 'course...[174.9, 14.0, 33.0, 4.0, 4.0, 11.0, 6.0]5['whip sugar and shortening in a large bowl , ...i've heard of the 'cookies by design' company,...['granulated sugar', 'shortening', 'eggs', 'fl...7
\n", - "

226647 rows × 12 columns

\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ], - "text/plain": [ - " name ... n_ingredients\n", - "0 arriba baked winter squash mexican style ... 7\n", - "1 a bit different breakfast pizza ... 6\n", - "2 all in the kitchen chili ... 13\n", - "3 alouette potatoes ... 11\n", - "4 amish tomato ketchup for canning ... 8\n", - "... ... ... ...\n", - "231632 zydeco soup ... 22\n", - "231633 zydeco spice mix ... 13\n", - "231634 zydeco ya ya deviled eggs ... 8\n", - "231635 cookies by design cookies on a stick ... 10\n", - "231636 cookies by design sugar shortbread cookies ... 7\n", - "\n", - "[226647 rows x 12 columns]" - ] - }, - "metadata": {}, - "execution_count": 12 - } - ], - "source": [ - "data" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "ehQtl4_t2C58", - "outputId": "1bc8bf2a-9fd0-42a2-b51a-afc0b5b23b8c" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "(27863, 8)\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namedescriptioningredientsn_ingredientsstepsn_stepsminutesr_type
9beat this banana breadfrom ann hodgman's['sugar', 'unsalted butter', 'bananas', 'eggs'...9['preheat oven to 350 degrees', 'butter two 9x...1270bread
32grilled ranch breadbuttery and['butter', 'dry ranch dressing mix', 'french b...3['cream the butter with the dressing mix', 'sp...413bread
47jeanne s style birthday cakea bakery in winnipeg is famous for this specia...['shortening', 'icing sugar', 'vanilla', 'all-...10['to prepare base , cut shortening into dry in...25230cake
48jiffy extra moist carrot cakethis is a very tasty, moist, carrot cake. a ni...['yellow cake mix', 'vanilla instant pudding m...11['preheat oven to 350 degrees', 'mix together ...850cake
49jiffy roasted corn and jalapeno cornbreadthis is a moist, easy, colorful and delicious ...['whole kernel corn', 'onion', 'red bell peppe...10['melt butter in a saut pan', 'add the corn , ...1035bread
...........................
231618zwetschgenkuchen plum cakethis is a classic german pastry and a deliciou...['margarine', 'all-purpose flour', 'sugar', 'b...13['prepare pastry: cut margarine into flour , s...1160cake
231621zwieback toast teething cookiesa quintessential childhood food. this is a co...['sugar', 'active dry yeast', 'milk', 'butter'...9['stir together 1 / 2 teaspoon of sugar , the ...23100cookie
231624zwiebelkuchen southwest german onion cakethis is a traditional late summer early fall s...['fresh yeast', 'milk', 'flour', 'butter', 'eg...13['for the dough:', 'dissolve the yeast in the ...1075cake
231635cookies by design cookies on a sticki've heard of the 'cookies by design' company,...['butter', 'eagle brand condensed milk', 'ligh...10['place melted butter in a large mixing bowl a...929cookie
231636cookies by design sugar shortbread cookiesi've heard of the 'cookies by design' company,...['granulated sugar', 'shortening', 'eggs', 'fl...7['whip sugar and shortening in a large bowl , ...520cookie
\n", - "

27863 rows × 8 columns

\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ], - "text/plain": [ - " name ... r_type\n", - "9 beat this banana bread ... bread\n", - "32 grilled ranch bread ... bread\n", - "47 jeanne s style birthday cake ... cake\n", - "48 jiffy extra moist carrot cake ... cake\n", - "49 jiffy roasted corn and jalapeno cornbread ... bread\n", - "... ... ... ...\n", - "231618 zwetschgenkuchen plum cake ... cake\n", - "231621 zwieback toast teething cookies ... cookie\n", - "231624 zwiebelkuchen southwest german onion cake ... cake\n", - "231635 cookies by design cookies on a stick ... cookie\n", - "231636 cookies by design sugar shortbread cookies ... cookie\n", - "\n", - "[27863 rows x 8 columns]" - ] - }, - "metadata": {}, - "execution_count": 13 - } - ], - "source": [ - "# filter the data\n", - "\n", - "data_filtered = data[(data['name'].str.contains(\"cake\")) | \n", - " (data['name'].str.contains(\"cookie\")) |\n", - " (data['name'].str.contains(\"bread\")) ][[\"name\", \"description\", \"ingredients\", \"n_ingredients\", \"steps\", \"n_steps\", \"minutes\"]]\n", - "\n", - "# create function to assign a recipe type\n", - "\n", - "def f_type(row):\n", - " if row['name'].find('cake') != -1:\n", - " val = 'cake'\n", - " elif row['name'].find('cookie') != -1:\n", - " val = 'cookie'\n", - " elif row['name'].find('bread') != -1:\n", - " val = 'bread'\n", - " else:\n", - " val ='unknown'\n", - " return val\n", - " \n", - "\n", - "data_filtered[\"r_type\"] = data_filtered.apply(f_type, axis = 1)\n", - "\n", - "print(data_filtered.shape)\n", - "data_filtered\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "O7P0Y2_vKe7S" - }, - "source": [ - "### Exploratory analysis (what do we know about cakes, cookies and bread?)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Oi_Py_GX77l6" - }, - "source": [ - "Let's see the distribution of the number of ingredients needed. This may show us the **complexity** of the recipe." 
- ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 562 - }, - "id": "PrDdTFFGKtyi", - "outputId": "ae63b32b-8fdf-473e-9ee1-1c24097b3ea2" - }, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "
\n", - " \n", - " \n", - " \n", - "
\n", - " \n", - "
\n", - "\n", - "" - ] - }, - "metadata": {} - } - ], - "source": [ - "import plotly.figure_factory as ff\n", - "# let's see the distribution of the number of ingredients needed\n", - "cookie_n_ing = data_filtered.loc[data_filtered['r_type'] == 'cookie'][\"n_ingredients\"]\n", - "cake_n_ing = data_filtered.loc[data_filtered['r_type'] == 'cake'][\"n_ingredients\"]\n", - "bread_n_ing = data_filtered.loc[data_filtered['r_type'] == 'bread'][\"n_ingredients\"]\n", - "\n", - "hist_data = [cookie_n_ing.astype('float'), cake_n_ing.astype('float'), bread_n_ing.astype('float')]\n", - "group_labels = ['Cookie', 'Cake', 'Bread']\n", - "colors = ['rgb(239, 202, 8)', 'rgb(238, 66, 102)', 'rgb(0, 166, 166)']\n", - "\n", - "fig = ff.create_distplot(hist_data, group_labels, bin_size = 1, colors = colors)\n", - "fig.show()\n", - "#hist_data_n_ing\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VXuqJXbH87EF" - }, - "source": [ - "Let's consider a number of steps in the recipe which is a good metric of the **recipe's difficulty** as well as **effort** to cook." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 562 - }, - "id": "UEiz1EXYwYMV", - "outputId": "85ffb908-36da-4b6e-a4b8-e9fe43c86acc" - }, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "
\n", - " \n", - " \n", - " \n", - "
\n", - " \n", - "
\n", - "\n", - "" - ] - }, - "metadata": {} - } - ], - "source": [ - "# let's see the distribution of the number of steps in recipe\n", - "cookie_n_steps = data_filtered.loc[data_filtered['r_type'] == 'cookie'][\"n_steps\"]\n", - "cake_n_steps = data_filtered.loc[data_filtered['r_type'] == 'cake'][\"n_steps\"]\n", - "bread_n_steps = data_filtered.loc[data_filtered['r_type'] == 'bread'][\"n_steps\"]\n", - "\n", - "hist_data = [cookie_n_steps.astype('float'), cake_n_steps.astype('float'), bread_n_steps.astype('float')]\n", - "group_labels = ['Cookie', 'Cake', 'Bread']\n", - "colors = ['rgb(239, 202, 8)', 'rgb(238, 66, 102)', 'rgb(0, 166, 166)']\n", - "\n", - "fig = ff.create_distplot(hist_data, group_labels, bin_size = 1, colors = colors)\n", - "fig.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "id": "78ren9Dvn4lU" - }, - "outputs": [], - "source": [ - "# calculate time to cook and remove outliers \n", - "cookie_min = data_filtered.loc[data_filtered['r_type'] == 'cookie'][\"minutes\"]\n", - "cake_min = data_filtered.loc[data_filtered['r_type'] == 'cake'][\"minutes\"]\n", - "bread_min = data_filtered.loc[data_filtered['r_type'] == 'bread'][\"minutes\"]\n", - "\n", - "df1 = pd.DataFrame(cake_min[~((cake_min-cake_min.mean()).abs() > 1*cake_min.std())])\n", - "df2 = pd.DataFrame(cookie_min[~((cookie_min-cookie_min.mean()).abs() > 1*cookie_min.std())])\n", - "df3 = pd.DataFrame(bread_min[~((bread_min-bread_min.mean()).abs() > 1*bread_min.std())])\n" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 562 - }, - "id": "zUIY8DSPmY54", - "outputId": "82f33e57-eda2-4c96-a7db-7acb76e910f0" - }, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "
\n", - " \n", - " \n", - " \n", - "
\n", - " \n", - "
\n", - "\n", - "" - ] - }, - "metadata": {} - } - ], - "source": [ - "import plotly.graph_objects as go\n", - "\n", - "fig = go.Figure()\n", - "fig.add_trace(go.Box(x=df1.minutes, name='Cake'))\n", - "fig.add_trace(go.Box(x=df2.minutes, name='Cookie'))\n", - "fig.add_trace(go.Box(x=df3.minutes, name='Bread'))\n", - "\n", - "fig.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "psQvX4fu-GvP" - }, - "source": [ - "Here we notice that in both cases on average **cake** recipes seem to be more difficult to repeat as they require **more ingredients** and **more effort**, i.e. steps to follow. \n", - "\n", - "For **cookies** we see **less variability** in number of ingredients and number of steps as well as time to cook. \n", - "\n", - "At the same time **bread** recipes are **very diverse**: from easy-to-follow 3-step breads to a very time-consuming 145-step recipe requiring 43 ingredients. On average **it takes longer to cook a bread** than a cake or cookies.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "53DFAGpH_Ukx" - }, - "source": [ - "### Text analysis (what makes a cookie that crunchy, cake that spongy, and bread that fluffy?)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xESjYloEP-BZ" - }, - "source": [ - "**Next we will see what makes a cookie that crunchy, cake that spongy, and \n", - "bread that fluffy**" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "id": "l66GRmehJhFv" - }, - "outputs": [], - "source": [ - "# our ingredients contain some special characters we won't need\n", - "from collections import Counter\n", - "#Counter(data_filtered['ingredients'].sum())\n", - "data_filtered['ingredients'] = data_filtered['ingredients'].str.replace('\\\"', '\\'').replace('\\[', '').replace('\\\"', '').replace('\\]', '')\n", - "data_filtered['steps'] = data_filtered['steps'].str.replace('\\\"', '\\'').replace('\\[', '').replace('\\\"', '').replace('\\]', '')" - ] - }, - 
{ - "cell_type": "code", - "execution_count": 19, - "metadata": { - "id": "1-pDhyCWIRUP" - }, - "outputs": [], - "source": [ - "# calculate the most common ingredients for cakes, cookies and bread\n", - "\n", - "cookie_ing = dict(Counter(data_filtered.loc[data_filtered['r_type'] == 'cookie']['ingredients'].apply(lambda x: x[2:-2].split('\\', \\'')).sum()).most_common())\n", - "cake_ing = dict(Counter(data_filtered.loc[data_filtered['r_type'] == 'cake']['ingredients'].apply(lambda x: x[2:-2].split('\\', \\'')).sum()).most_common())\n", - "bread_ing = dict(Counter(data_filtered.loc[data_filtered['r_type'] == 'bread']['ingredients'].apply(lambda x: x[2:-2].split('\\', \\'')).sum()).most_common())" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "kuWgY6FXbZ2A", - "outputId": "7890d772-f4f1-403a-a22b-4d72bcaa40af" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "[('salt', 3666),\n", - " ('butter', 3259),\n", - " ('baking soda', 3242),\n", - " ('sugar', 2599),\n", - " ('flour', 2511),\n", - " ('vanilla', 2282),\n", - " ('eggs', 2193),\n", - " ('brown sugar', 2139),\n", - " ('egg', 2093),\n", - " ('all-purpose flour', 1933),\n", - " ('baking powder', 1881),\n", - " ('vanilla extract', 1497),\n", - " ('granulated sugar', 975),\n", - " ('cinnamon', 859),\n", - " ('unsalted butter', 730)]" - ] - }, - "metadata": {}, - "execution_count": 20 - } - ], - "source": [ - "# cookie most common ingredients\n", - "Counter(data_filtered.loc[data_filtered['r_type'] == 'cookie']['ingredients'].apply(lambda x: x[2:-2].split('\\', \\'')).sum()).most_common(15)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "iX0wFoLkcE3J", - "outputId": "c46ec290-dae4-454b-81a2-88f4dbbf0d46" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": 
[ - "[('eggs', 8155),\n", - " ('sugar', 6591),\n", - " ('salt', 6291),\n", - " ('butter', 5755),\n", - " ('baking powder', 4906),\n", - " ('baking soda', 3861),\n", - " ('flour', 3723),\n", - " ('milk', 3217),\n", - " ('vanilla', 3203),\n", - " ('all-purpose flour', 3005),\n", - " ('vanilla extract', 2616),\n", - " ('water', 2367),\n", - " ('cream cheese', 2325),\n", - " ('cinnamon', 2153),\n", - " ('egg', 2120)]" - ] - }, - "metadata": {}, - "execution_count": 21 - } - ], - "source": [ - "# cake most common ingredients\n", - "Counter(data_filtered.loc[data_filtered['r_type'] == 'cake']['ingredients'].apply(lambda x: x[2:-2].split('\\', \\'')).sum()).most_common(15)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "EVz5Xsy8cRTl", - "outputId": "cd6acbd6-de5b-4802-d5ec-f7a91610ee36" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "[('salt', 5367),\n", - " ('sugar', 3499),\n", - " ('butter', 2877),\n", - " ('eggs', 2755),\n", - " ('baking powder', 2083),\n", - " ('baking soda', 2011),\n", - " ('flour', 1913),\n", - " ('water', 1821),\n", - " ('milk', 1699),\n", - " ('all-purpose flour', 1608),\n", - " ('egg', 1403),\n", - " ('cinnamon', 1055),\n", - " ('whole wheat flour', 906),\n", - " ('bread flour', 877),\n", - " ('brown sugar', 848)]" - ] - }, - "metadata": {}, - "execution_count": 22 - } - ], - "source": [ - "# bread most common ingredients\n", - "Counter(data_filtered.loc[data_filtered['r_type'] == 'bread']['ingredients'].apply(lambda x: x[2:-2].split('\\', \\'')).sum()).most_common(15)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "id": "iu7RjqjEG4yP" - }, - "outputs": [], - "source": [ - "# make a nice visualization \n", - "from wordcloud import WordCloud\n", - "import matplotlib.pyplot as plt\n", - "\n", - "def makeImage(text):\n", - " wc = WordCloud(background_color=\"white\", 
max_words=50)\n", - " wc.generate_from_frequencies(text)\n", - " plt.imshow(wc, interpolation=\"bilinear\")\n", - " plt.axis(\"off\")\n", - " plt.show()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 54 - }, - "id": "zVN3dc6RVpa4", - "outputId": "f80e04fd-43f0-4fec-88d5-8458e373e14e" - }, - "outputs": [ - { - "output_type": "display_data", - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - } - } - ], - "source": [ - "makeImage(cookie_ing)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 54 - }, - "id": "XDhCWb-jYS-l", - "outputId": "a2a4ac9e-431a-4188-beeb-b9b21f25f783" - }, - "outputs": [ - { - "output_type": "display_data", - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - } - } - ], - "source": [ - "makeImage(cake_ing)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 54 - }, - "id": "3cVYdNlqYXgV", - "outputId": "95636494-a253-4fdb-8896-c4835aff5c1f" - }, - "outputs": [ - { - "output_type": "display_data", - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - } - } - ], - "source": [ - "makeImage(bread_ing)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oVZ1fmE-Yxm-" - }, - "source": [ - "### Dataset to plain text " - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": { - "id": "nmqhPy8iYcUJ" - }, - "outputs": [], - "source": [ - "data2text = data_filtered[[\"name\", \"description\", \"ingredients\", \"steps\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "qj-KDseSg3cQ", - "outputId": "d53c953f-3b69-48ce-bee7-018d1973ffdf" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "name object\n", - "description object\n", - "ingredients object\n", - "steps object\n", - "dtype: object" - ] - }, - "metadata": {}, - "execution_count": 28 - } - ], - "source": [ - "data2text.dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "id": "3nso7QL1go0D" - }, - "outputs": [], - "source": [ - "data2text.reset_index(drop=True, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": { - "id": "rSuWJI5_gqjR" - }, - "outputs": [], - "source": [ - "data2text = data2text.rename(columns={'name': 'title'})" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": { - "id": "2GDr2m2vmwCL" - }, - "outputs": [], - "source": [ - "data2text[\"ingredients\"] = data2text[\"ingredients\"].str.replace(\"\\['\", \"\").str.replace(\"\\']\", \"\")\n", - "data2text[\"steps\"] = data2text[\"steps\"].str.replace(\"\\['\", \"\").str.replace(\"\\']\", \"\")" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 419 - }, - "id": "1TFBNP91Z7kU", - "outputId": "c7336027-1043-40c4-ba83-472e462f72a3" - }, - "outputs": [ - { - "output_type": "execute_result", - 
"data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
titledescriptioningredientssteps
0beat this banana breadfrom ann hodgman'ssugar', 'unsalted butter', 'bananas', 'eggs', ...preheat oven to 350 degrees', 'butter two 9x5'...
1grilled ranch breadbuttery andbutter', 'dry ranch dressing mix', 'french breadcream the butter with the dressing mix', 'spre...
2jeanne s style birthday cakea bakery in winnipeg is famous for this specia...shortening', 'icing sugar', 'vanilla', 'all-pu...to prepare base , cut shortening into dry ingr...
3jiffy extra moist carrot cakethis is a very tasty, moist, carrot cake. a ni...yellow cake mix', 'vanilla instant pudding mix...preheat oven to 350 degrees', 'mix together th...
4jiffy roasted corn and jalapeno cornbreadthis is a moist, easy, colorful and delicious ...whole kernel corn', 'onion', 'red bell pepper'...melt butter in a saut pan', 'add the corn , on...
...............
27858zwetschgenkuchen plum cakethis is a classic german pastry and a deliciou...margarine', 'all-purpose flour', 'sugar', 'bak...prepare pastry: cut margarine into flour , sug...
27859zwieback toast teething cookiesa quintessential childhood food. this is a co...sugar', 'active dry yeast', 'milk', 'butter', ...stir together 1 / 2 teaspoon of sugar , the ye...
27860zwiebelkuchen southwest german onion cakethis is a traditional late summer early fall s...fresh yeast', 'milk', 'flour', 'butter', 'egg'...for the dough:', 'dissolve the yeast in the lu...
27861cookies by design cookies on a sticki've heard of the 'cookies by design' company,...butter', 'eagle brand condensed milk', 'light ...place melted butter in a large mixing bowl and...
27862cookies by design sugar shortbread cookiesi've heard of the 'cookies by design' company,...granulated sugar', 'shortening', 'eggs', 'flou...whip sugar and shortening in a large bowl , ad...
\n", - "

27863 rows ร— 4 columns

\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ], - "text/plain": [ - " title ... steps\n", - "0 beat this banana bread ... preheat oven to 350 degrees', 'butter two 9x5'...\n", - "1 grilled ranch bread ... cream the butter with the dressing mix', 'spre...\n", - "2 jeanne s style birthday cake ... to prepare base , cut shortening into dry ingr...\n", - "3 jiffy extra moist carrot cake ... preheat oven to 350 degrees', 'mix together th...\n", - "4 jiffy roasted corn and jalapeno cornbread ... melt butter in a saut pan', 'add the corn , on...\n", - "... ... ... ...\n", - "27858 zwetschgenkuchen plum cake ... prepare pastry: cut margarine into flour , sug...\n", - "27859 zwieback toast teething cookies ... stir together 1 / 2 teaspoon of sugar , the ye...\n", - "27860 zwiebelkuchen southwest german onion cake ... for the dough:', 'dissolve the yeast in the lu...\n", - "27861 cookies by design cookies on a stick ... place melted butter in a large mixing bowl and...\n", - "27862 cookies by design sugar shortbread cookies ... 
whip sugar and shortening in a large bowl , ad...\n", - "\n", - "[27863 rows x 4 columns]" - ] - }, - "metadata": {}, - "execution_count": 32 - } - ], - "source": [ - "data2text" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": { - "id": "rD1LcAAVdLp9" - }, - "outputs": [], - "source": [ - "# Section markers placed in front of each part of a stringified recipe.\n", - "STOP_WORD_TITLE = '📌 TITLE\\n\\n'\n", - "STOP_WORD_DESCRIPTION = '\\n👀 DESCRIPTION\\n\\n'\n", - "STOP_WORD_INGREDIENTS = '\\n🍒 INGREDIENTS\\n\\n'\n", - "STOP_WORD_INSTRUCTIONS = '\\n📝 INSTRUCTIONS\\n\\n'" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": { - "id": "jVBXuY2EdVfL" - }, - "outputs": [], - "source": [ - "def recipe_to_string(recipe):\n", - "    \"\"\"Render each row of a recipe DataFrame as one plain-text recipe.\n", - "\n", - "    Args:\n", - "        recipe: DataFrame with `title`, `description`, `ingredients` and `steps`\n", - "            columns; `ingredients` and `steps` are strings whose items are\n", - "            separated by \"', '\".\n", - "\n", - "    Returns:\n", - "        A list with one formatted string per row, in row order.\n", - "    \"\"\"\n", - "    result = []\n", - "    # Iterate over the argument rather than the notebook-global `data2text`.\n", - "    for _, row in recipe.iterrows():\n", - "        # Empty items (e.g. produced by an empty field) are skipped.\n", - "        ingredients_string = ''.join(\n", - "            f'• {ingredient}\\n' for ingredient in row.ingredients.split(\"', '\") if ingredient\n", - "        )\n", - "        instructions_string = ''.join(\n", - "            f'▪︎ {instruction}\\n' for instruction in row.steps.split(\"', '\") if instruction\n", - "        )\n", - "        result.append(\n", - "            f'{STOP_WORD_TITLE}{row.title}\\n'\n", - "            f'{STOP_WORD_DESCRIPTION}{row.description}\\n'\n", - "            f'{STOP_WORD_INGREDIENTS}{ingredients_string}'\n", - "            f'{STOP_WORD_INSTRUCTIONS}{instructions_string}'\n", - "        )\n", - "    return result" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": { - "id": "GlvFPGA81E-o" - }, - "outputs": [], - "source": [ - "dataset_stringified = recipe_to_string(data2text) " - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "BUitJk6wvh7I", - "outputId": "94e1c591-34ec-4b91-9513-e24d9764c9fd" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Stringified dataset size: 27863\n" - ] - } - ], - 
"source": [ - "print('Stringified dataset size: ', len(dataset_stringified))" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "pV-dhXcDySFl", - "outputId": "caa7d082-e136-42ab-bcc2-a8bd17d4bd14" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Recipe #1\n", - "---------\n", - "๐Ÿ“Œ TITLE\n", - "\n", - "beat this banana bread\n", - "\n", - "๐Ÿ‘€ DESCRIPTION\n", - "\n", - "from ann hodgman's \n", - "\n", - "๐Ÿ’ INGREDIENTS\n", - "\n", - "โ€ข sugar\n", - "โ€ข unsalted butter\n", - "โ€ข bananas\n", - "โ€ข eggs\n", - "โ€ข fresh lemon juice\n", - "โ€ข orange rind\n", - "โ€ข cake flour\n", - "โ€ข baking soda\n", - "โ€ข salt\n", - "\n", - "๐Ÿ“ INSTRUCTIONS\n", - "\n", - "โ–ช๏ธŽ preheat oven to 350 degrees\n", - "โ–ช๏ธŽ butter two 9x5' loaf pans\n", - "โ–ช๏ธŽ cream the sugar and the butter until light and whipped\n", - "โ–ช๏ธŽ add the bananas , eggs , lemon juice , orange rind\n", - "โ–ช๏ธŽ beat until blended uniformly\n", - "โ–ช๏ธŽ be patient , and beat until the banana lumps are gone\n", - "โ–ช๏ธŽ sift the dry ingredients together\n", - "โ–ช๏ธŽ fold lightly and thoroughly into the banana mixture\n", - "โ–ช๏ธŽ pour the batter into prepared loaf pans\n", - "โ–ช๏ธŽ bake for 45 to 55 minutes , until the loaves are firm in the middle and the edges begin to pull away from the pans\n", - "โ–ช๏ธŽ cool the loaves on racks for 30 minutes before removing from the pans\n", - "โ–ช๏ธŽ freezes well\n", - "\n", - "\n", - "\n", - "Recipe #2\n", - "---------\n", - "๐Ÿ“Œ TITLE\n", - "\n", - "grilled ranch bread\n", - "\n", - "๐Ÿ‘€ DESCRIPTION\n", - "\n", - "buttery and \n", - "\n", - "๐Ÿ’ INGREDIENTS\n", - "\n", - "โ€ข butter\n", - "โ€ข dry ranch dressing mix\n", - "โ€ข french bread\n", - "\n", - "๐Ÿ“ INSTRUCTIONS\n", - "\n", - "โ–ช๏ธŽ cream the butter with the dressing mix\n", - "โ–ช๏ธŽ spread evenly on the bread halves\n", - "โ–ช๏ธŽ place under the 
broiler for 2-3 minutes , until golden and the butter begins to seep and bubble\n", - "โ–ช๏ธŽ serve hot\n", - "\n", - "\n", - "\n", - "Recipe #3\n", - "---------\n", - "๐Ÿ“Œ TITLE\n", - "\n", - "jeanne s style birthday cake\n", - "\n", - "๐Ÿ‘€ DESCRIPTION\n", - "\n", - "a bakery in winnipeg is famous for this special cake and ship it to all parts of canada. a recipe request column in the winnipeg free press printed this copycat recipe submitted by a reader. enjoy !\n", - "\n", - "๐Ÿ’ INGREDIENTS\n", - "\n", - "โ€ข shortening\n", - "โ€ข icing sugar\n", - "โ€ข vanilla\n", - "โ€ข all-purpose flour\n", - "โ€ข baking powder\n", - "โ€ข sugar\n", - "โ€ข eggs\n", - "โ€ข salt\n", - "โ€ข milk\n", - "โ€ข butter\n", - "\n", - "๐Ÿ“ INSTRUCTIONS\n", - "\n", - "โ–ช๏ธŽ to prepare base , cut shortening into dry ingredients , mix well\n", - "โ–ช๏ธŽ pat firmly and evenly into an 8 inch square pan and bake at 350 deg\n", - "โ–ช๏ธŽ f\n", - "โ–ช๏ธŽ for 10-12 minutes\n", - "โ–ช๏ธŽ cool\n", - "โ–ช๏ธŽ for cake: cream shortening , and sugar\n", - "โ–ช๏ธŽ add eggs and vanilla , beating well until fluffy\n", - "โ–ช๏ธŽ sift flour , baking powder and salt together\n", - "โ–ช๏ธŽ add to creamed mixture alternately with milk\n", - "โ–ช๏ธŽ pour batter into a greased and floured 8 inch square pan\n", - "โ–ช๏ธŽ bake at 350 deg\n", - "โ–ช๏ธŽ f for 25 - 40 minutes\n", - "โ–ช๏ธŽ frosting: in small saucepan , stir tog\n", - "โ–ช๏ธŽ milk and flour\n", - "โ–ช๏ธŽ cook , stirring constantly , until mixute is thickened and smooth\n", - "โ–ช๏ธŽ cool\n", - "โ–ช๏ธŽ on highest speed of mixer , beat cooled flour mixture with butter , shortening and vanilla until smooth and fluffy\n", - "โ–ช๏ธŽ blend in icing sugar and salt\n", - "โ–ช๏ธŽ continue beating until frosting is very fluffy\n", - "โ–ช๏ธŽ this will take at least 15 minutes\n", - "โ–ช๏ธŽ to assemble cake: place shortbread base on serving plate\n", - "โ–ช๏ธŽ spread with small amount of frosting\n", - "โ–ช๏ธŽ place cake on base\n", - "โ–ช๏ธŽ cover top and 
sides of cake with remaining frosting\n", - "โ–ช๏ธŽ if desired , garnish sides of cake with shaved semi-sweet chocolate\n", - "\n", - "\n", - "\n" - ] - } - ], - "source": [ - "for recipe_index, recipe_string in enumerate(dataset_stringified[:3]):\n", - " print('Recipe #{}\\n---------'.format(recipe_index + 1))\n", - " print(recipe_string)\n", - " print('\\n')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sLTAaK3nMd1g" - }, - "source": [ - "Let's see how many characters our recipes have" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 265 - }, - "id": "LLr20-NlyZYW", - "outputId": "e189d521-077c-4110-a569-4b7d05a9752f" - }, - "outputs": [ - { - "output_type": "display_data", - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAU00lEQVR4nO3df6zd9X3f8eerEJIurWI78SzPJrOjWqnoHyH0CoxSVVlYjIEoZlIakUXDoUyeNjYl66TWLH+wJo1EtqlpUFdSVNyZiIZQmgyLpKWeQzRNGoRLoYSf9YXAsAX4BhOyBjUt6Xt/nM8lB+de33vxuef6+vN8SEfn831/P+frz+d8r1/n3O/3e85NVSFJ6sNPLfcAJEnjY+hLUkcMfUnqiKEvSR0x9CWpI6cv9wCO521ve1tt2rRpuYchSSvKfffd992qWjvbupM69Ddt2sTk5ORyD0OSVpQkT8+1zsM7ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSPzhn6SdyZ5YOj2/SSfSLImyf4kB9v96tY/Sa5LMpXkwSTnDG1rZ+t/MMnOpZyYJOknzRv6VfV4VZ1dVWcDvwi8DHwV2A0cqKotwIG2DHARsKXddgHXAyRZA1wDnAecC1wz80IhSRqPxR7euQB4oqqeBnYAe1t9L3Bpa+8AbqqBu4FVSdYDFwL7q+poVb0I7Ae2n/AMJEkLtthP5F4GfKm111XVs639HLCutTcAzww95lCrzVV/jSS7GPyGwNvf/vZFDu/EbNr9tVnrT117yVjHIUlLZcHv9JOcAXwQ+ONj19Xgz2+N5E9wVdUNVTVRVRNr18761RGSpNdpMYd3LgL+oqqeb8vPt8M2tPsjrX4YOHPocRtbba66JGlMFhP6H+HHh3YA9gEzV+DsBG4fql/eruLZCrzUDgPdCWxLsrqdwN3WapKkMVnQMf0kbwbeD/yrofK1wK1JrgSeBj7c6l8HLgamGFzpcwVAVR1N8mng3tbvU1V19IRnIElasAWFflX9AHjrMbUXGFzNc2zfAq6aYzt7gD2LH6YkaRT8RK4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlq
SOGviR1xNCXpI4Y+pLUEUNfkjqyoNBPsirJbUkeS/JokvOTrEmyP8nBdr+69U2S65JMJXkwyTlD29nZ+h9MsnOpJiVJmt1C3+l/Hvizqvp54F3Ao8Bu4EBVbQEOtGWAi4At7bYLuB4gyRrgGuA84FzgmpkXCknSeJw+X4ckbwF+GfgYQFX9LfC3SXYA723d9gLfBH4D2AHcVFUF3N1+S1jf+u6vqqNtu/uB7cCXRjedpbFp99dmrT917SVjHokknZiFvNPfDEwDf5jk/iR/kOTNwLqqerb1eQ5Y19obgGeGHn+o1eaqS5LGZCGhfzpwDnB9Vb0b+AE/PpQDQHtXX6MYUJJdSSaTTE5PT49ik5KkZiGhfwg4VFX3tOXbGLwIPN8O29Duj7T1h4Ezhx6/sdXmqr9GVd1QVRNVNbF27drFzEWSNI95Q7+qngOeSfLOVroAeATYB8xcgbMTuL219wGXt6t4tgIvtcNAdwLbkqxuJ3C3tZokaUzmPZHb/Dvg5iRnAE8CVzB4wbg1yZXA08CHW9+vAxcDU8DLrS9VdTTJp4F7W79PzZzUlSSNx4JCv6oeACZmWXXBLH0LuGqO7ewB9ixmgJKk0fETuZLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdWVDoJ3kqybeTPJBkstXWJNmf5GC7X93qSXJdkqkkDyY5Z2g7O1v/g0l2Ls2UJElzWcw7/X9SVWdX1URb3g0cqKotwIG2DHARsKXddgHXw+BFArgGOA84F7hm5oVCkjQeJ3J4Zwewt7X3ApcO1W+qgbuBVUnWAxcC+6vqaFW9COwHtp/Avy9JWqSFhn4Bf57kviS7Wm1dVT3b2s8B61p7A/DM0GMPtdpc9ddIsivJZJLJ6enpBQ5PkrQQpy+w3y9V1eEk/xDYn+Sx4ZVVVUlqFAOqqhuAGwAmJiZGsk1J0sCC3ulX1eF2fwT4KoNj8s+3wza0+yOt+2HgzKGHb2y1ueqSpDGZN/STvDnJz860gW3AQ8A+YOYKnJ3A7a29D7i8XcWzFXipHQa6E9iWZHU7gbut1SRJY7KQwzvrgK8mmen/R1X1Z0nuBW5NciXwNPDh1v/rwMXAFPAycAVAVR1N8mng3tbvU1V1dGQzkSTNa97Qr6ongXfNUn8BuGCWegFXzbGtPcCexQ9TkjQKfiJXkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdWXDoJzktyf1J7mjLm5Pck2QqyZeTnNHqb2zLU239pqFtXN3qjye5cNSTkSQd32Le6X8ceHRo+bPA56rq54AXgStb/UrgxVb/XOtHkrOAy4BfALYDv5fktBMbviRpMU5fSKckG4FLgM8Av5YkwPuAf9667AX+E3A9sKO1AW4Dfrf13wHcUlU/BL6TZAo4F/g/I5nJImza/bUl3c5T114yku1L0qgt9J3+7wC/Dvx9W34r8L2qeqUtHwI2tPYG4BmAtv6l1v/V+iyPkSSNwbyhn+QDwJGqum8M4yHJriSTSSanp6fH8U9KUjcW8k7/PcAHkzwF3MLgsM7ngVVJZg4PbQQOt/Zh4EyAtv4twAvD9Vke86qquqGqJqpqYu3atYuekCRpbvOGflVdXVUbq2oTgxOx36iqjwJ3AR9q3XYCt7f2vrZMW/+NqqpWv6xd3bMZ2AJ8a2QzkSTNa0EncufwG8AtSX4LuB+4sdVvBL7YTtQeZfBCQVU9nORW4BHgFeCqqvrRCfz7kqRFWlToV9U3gW+29pMMrr45ts/fAL8yx+M/w+AKIEnSMvATuZLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kd
cTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdmTf0k7wpybeS/GWSh5P8ZqtvTnJPkqkkX05yRqu/sS1PtfWbhrZ1das/nuTCpZqUJGl2C3mn/0PgfVX1LuBsYHuSrcBngc9V1c8BLwJXtv5XAi+2+udaP5KcBVwG/AKwHfi9JKeNcjKSpOObN/Rr4K/b4hvarYD3Abe1+l7g0tbe0ZZp6y9Ikla/pap+WFXfAaaAc0cyC0nSgizomH6S05I8ABwB9gNPAN+rqldal0PAhtbeADwD0Na/BLx1uD7LY4b/rV1JJpNMTk9PL35GkqQ5LSj0q+pHVXU2sJHBu/OfX6oBVdUNVTVRVRNr165dqn9Gkrq0qKt3qup7wF3A+cCqJKe3VRuBw619GDgToK1/C/DCcH2Wx0iSxmAhV++sTbKqtX8aeD/wKIPw/1DrthO4vbX3tWXa+m9UVbX6Ze3qns3AFuBbo5qIJGl+p8/fhfXA3nalzU8Bt1bVHUkeAW5J8lvA/cCNrf+NwBeTTAFHGVyxQ1U9nORW4BHgFeCqqvrRaKcjSTqeeUO/qh4E3j1L/Ulmufqmqv4G+JU5tvUZ4DOLH6YkaRT8RK4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHFvLVylqkTbu/Nmv9qWsvGfNIJOm1fKcvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JH5g39JGcmuSvJI0keTvLxVl+TZH+Sg+1+dasnyXVJppI8mOScoW3tbP0PJtm5dNOSJM1mIe/0XwH+Q1WdBWwFrkpyFrAbOFBVW4ADbRngImBLu+0CrofBiwRwDXAecC5wzcwLhSRpPOYN/ap6tqr+orX/H/AosAHYAext3fYCl7b2DuCmGrgbWJVkPXAhsL+qjlbVi8B+YPtIZyNJOq5FHdNPsgl4N3APsK6qnm2rngPWtfYG4Jmhhx1qtbnqx/4bu5JMJpmcnp5ezPAkSfNYcOgn+RngT4BPVNX3h9dVVQE1igFV1Q1VNVFVE2vXrh3FJiVJzYJCP8kbGAT+zVX1lVZ+vh22od0fafXDwJlDD9/YanPVJUljspCrdwLcCDxaVb89tGofMHMFzk7g9qH65e0qnq3AS+0w0J3AtiSr2wncba0mSRqThXy18nuAfwF8O8kDrfYfgWuBW5NcCTwNfLit+zpwMTAFvAxcAVBVR5N8Gri39ftUVR0dySwkSQsyb+hX1f8GMsfqC2bpX8BVc2xrD7BnMQOUJI2On8iVpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOLORrGDQim3Z/bdb6U9deMuaRSOqV7/QlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdWTe0E+yJ8mRJA8N1dYk2Z/kYLtf3epJcl2SqSQPJjln6DE7W/+DSXYuzXQkScezkHf6/x3YfkxtN3CgqrYAB9oywEXAlnbbBVwPgxcJ4BrgPOBc4JqZFwpJ0vjMG/pV9b+Ao8eUdwB7W3svcOlQ/aYauBtYlWQ9cCGwv6qOVtWLwH5+8oVEkrTEXu8x/XVV9WxrPwesa+0NwDND/Q612lz1n5BkV5LJJJPT09Ovc3iSpNmc8IncqiqgRjCWme3dUFUTVTWxdu3aUW1WksTrD/3n22Eb2v2RVj8MnDnUb2OrzVWXJI3R6w39fcDMFTg7gduH6pe3q3i2Ai+1w0B3AtuSrG4ncLe1miRpjOb9y1lJvgS8F3hbkkMMrsK5Frg1yZXA08CHW/evAxcDU8DLwBUAVXU0yaeBe1u/T1XVsSeHJUlLLIND8ieniYmJmpycHPl25/qzhScb/4yipNcjyX1VNTHbOj+RK0kdMfQlqSOGviR1xNCXp
I4Y+pLUEUNfkjoy73X6K9lKuTRTksbFd/qS1BFDX5I6ckof3lnp5jo85Sd1Jb1evtOXpI4Y+pLUEUNfkjpi6EtSRzyRuwJ5glfS6+U7fUnqiKEvSR0x9CWpI4a+JHXEE7mnkON9wZwneSWBod8Nr/iRBMtweCfJ9iSPJ5lKsnvc/74k9Wys7/STnAb8N+D9wCHg3iT7quqRcY5DP7bYvzngbwbSyjbuwzvnAlNV9SRAkluAHYChv0KM6g/T+OIhLY9xh/4G4Jmh5UPAecMdkuwCdrXFv07y+AK3/Tbguyc8wuV3Ksxj3jnks2MayevXxX5YIU6FeYx7Dv94rhUn3YncqroBuGGxj0syWVUTSzCksToV5uEcTg6nwhzg1JjHyTSHcZ/IPQycObS8sdUkSWMw7tC/F9iSZHOSM4DLgH1jHoMkdWush3eq6pUk/xa4EzgN2FNVD49o84s+JHSSOhXm4RxODqfCHODUmMdJM4dU1XKPQZI0Jn73jiR1xNCXpI6cEqF/Mn+1Q5Izk9yV5JEkDyf5eKuvSbI/ycF2v7rVk+S6NpcHk5wztK2drf/BJDuXYS6nJbk/yR1teXOSe9pYv9xOzpPkjW15qq3fNLSNq1v98SQXjnn8q5LcluSxJI8mOX+l7Yck/779HD2U5EtJ3rQS9kOSPUmOJHloqDay5z7JLyb5dnvMdUkypjn8l/bz9GCSryZZNbRu1ud4rryaaz+OXFWt6BuDE8JPAO8AzgD+Ejhrucc1NL71wDmt/bPAXwFnAf8Z2N3qu4HPtvbFwJ8CAbYC97T6GuDJdr+6tVePeS6/BvwRcEdbvhW4rLW/APzr1v43wBda+zLgy619Vts/bwQ2t/122hjHvxf4l619BrBqJe0HBh9u/A7w00PP/8dWwn4Afhk4B3hoqDay5x74Vuub9tiLxjSHbcDprf3ZoTnM+hxznLyaaz+OfB7j+GFd4h+m84E7h5avBq5e7nEdZ7y3M/juoceB9a22Hni8tX8f+MhQ/8fb+o8Avz9Uf02/MYx7I3AAeB9wR/vP9d2hH/hX9wODq7POb+3TW78cu2+G+41h/G9hEJg5pr5i9gM//kT7mva83gFcuFL2A7DpmMAcyXPf1j02VH9Nv6WcwzHr/hlwc2vP+hwzR14d7//TqG+nwuGd2b7aYcMyjeW42q/X7wbuAdZV1bNt1XPAutaeaz7LPc/fAX4d+Pu2/Fbge1X1yizjeXWsbf1Lrf9yzmEzMA38YTtE9QdJ3swK2g9VdRj4r8D/BZ5l8Lzex8raD8NG9dxvaO1j6+P2qwx+y4DFz+F4/59G6lQI/RUhyc8AfwJ8oqq+P7yuBi/tJ+21s0k+ABypqvuWeywn4HQGv5pfX1XvBn7A4JDCq1bAfljN4AsKNwP/CHgzsH1ZBzUiJ/tzP58knwReAW5e7rHM51QI/ZP+qx2SvIFB4N9cVV9p5eeTrG/r1wNHWn2u+SznPN8DfDDJU8AtDA7xfB5YlWTmA37D43l1rG39W4AXWN45HAIOVdU9bfk2Bi8CK2k//FPgO1U1XVV/B3yFwb5ZSfth2Kie+8OtfWx9LJJ8DPgA8NH24gWLn8MLzL0fR+pUCP2T+qsd2lUENwKPVtVvD63aB8xcfbCTwbH+mfrl7QqGrcBL7VfgO4FtSVa3d3zbWm3JVdXVVbWxqjYxeH6/UVUfBe4CPjTHHGbm9qHWv1r9snZVyWZgC4MTcOOYw3PAM0ne2UoXMPhK7xWzHxgc1tma5B+0n6uZOayY/XCMkTz3bd33k2xtz8vlQ9taUkm2Mzjs+cGqenlo1VzP8ax51fbLXPtxtJbyxM24bgzO9v8Vg7Pin1zu8Rwztl9i8Gvrg8AD7XYxg2N4B4CDwP8E1rT+YfCHZp4Avg1MDG3rV4GpdrtimebzXn589c47GPwgTwF/DLyx1d/Ulqfa+ncMPf6TbW6PswRXWMwz9rOBybYv/geDK0BW1H4AfhN4DHgI+CKDq0NO+
v0AfInBeYi/Y/Bb15WjfO6BifacPAH8LsecsF/COUwxOEY/83/7C/M9x8yRV3Ptx1Hf/BoGSerIqXB4R5K0QIa+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6sj/B69xojMM57+nAAAAAElFTkSuQmCC\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - } - } - ], - "source": [ - "recipes_lengths = []\n", - "for recipe_text in dataset_stringified:\n", - " recipes_lengths.append(len(recipe_text))\n", - "\n", - "plt.hist(recipes_lengths, bins=50)\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 268 - }, - "id": "cXWD8ggTMPmU", - "outputId": "e9d90303-0406-48b0-c9a2-efca1b0a6098" - }, - "outputs": [ - { - "output_type": "display_data", - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD7CAYAAACG50QgAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAASf0lEQVR4nO3df4xdZ33n8fenzg+6BTUOmVpe2+yY1qvKSFsTjUIQaJWCSJyk2rQSQk5XxWIjudpNJNBW2nVaaUPbRQqrFrZoaYq7sRoqSsgWUCxwm7ohVVWpJHFKSOKk3gzBKLZMbAiEVkhoHb77x30cLs6M59edO5553i/p6p77Pc898zyjO5977nPOPZOqQpLUh59Y6Q5IksbH0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6sicoZ/kNUkeSfLVJEeS/Harb03ycJLpJJ9JckmrX9oeT7f1k0Pbur3Vjya5brkGJUma2Xz29H8AvKOqfgHYAexMcjXwYeCjVfVzwHeAW1r7W4DvtPpHWzuSbAd2AW8CdgJ/mGTdKAcjSTq/i+ZqUINvb/1ze3hxuxXwDuBXW/0e4IPAXcBNbRngz4H/lSStfm9V/QD4epJp4Crg72f72VdccUVNTk4uaECS1LvHHnvsW1U1MdO6OUMfoO2RPwb8HPBx4GvAd6vqTGtyHNjUljcBzwNU1ZkkLwGvb/UvD212+Dkzmpyc5PDhw/PpoiSpSfKN2dbN60BuVb1cVTuAzQz2zn9+RH17lSR7khxOcvj06dPL9WMkqUsLOnunqr4LPAS8FbgsydlPCpuBE235BLAFoK3/aeDbw/UZnjP8M/ZV1VRVTU1MzPjpRJK0SPM5e2ciyWVt+SeBdwHPMAj/d7dmu4H72/KB9pi2/kvtuMABYFc7u2crsA14ZFQDkSTNbT5z+huBe9q8/k8A91XVF5I8Ddyb5L8DXwHubu3vBv60Hah9kcEZO1TVkST3AU8DZ4Bbq+rl0Q5HknQ+uZAvrTw1NVUeyJWkhUnyWFVNzbTOb+RKUkcMfUnqiKEvSR0x9CWpI/P6Rq6W1+TeLy6o/bE7b1ymnkha69zTl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOuK1d1ah812rx+vySDof9/QlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdWTO0E+yJclDSZ5OciTJ+1v9g0lOJHm83W4Yes7tSaaTHE1y3VB9Z6tNJ9m7PEOSJM1mPtfeOQP8RlX9Q5LXAY8lOdTWfbSqfm+4cZLtwC7gTcC/BP46yb9uqz8OvAs4Djya5EBVPT2KgUiS5jZn6FfVSeBkW/6nJM8Am87zl
JuAe6vqB8DXk0wDV7V101X1HECSe1tbQ1+SxmRBc/pJJoE3Aw+30m1JnkiyP8n6VtsEPD/0tOOtNltdkjQm8w79JK8FPgt8oKq+B9wF/Cywg8Engd8fRYeS7ElyOMnh06dPj2KTkqRmXqGf5GIGgf+pqvocQFW9UFUvV9UPgT/mR1M4J4AtQ0/f3Gqz1X9MVe2rqqmqmpqYmFjoeCRJ5zGfs3cC3A08U1UfGapvHGr2K8BTbfkAsCvJpUm2AtuAR4BHgW1Jtia5hMHB3gOjGYYkaT7mc/bO24BfA55M8nir/SZwc5IdQAHHgF8HqKojSe5jcID2DHBrVb0MkOQ24AFgHbC/qo6McCySpDnM5+ydvwMyw6qD53nOh4APzVA/eL7nSZKWl/8jd4zO979tJWkcvAyDJHXE0Jekjji9s8bMNoV07M4bx9wTSRci9/QlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6sicoZ9kS5KHkjyd5EiS97f65UkOJXm23a9v9ST5WJLpJE8kuXJoW7tb+2eT7F6+YUmSZjKfPf0zwG9U1XbgauDWJNuBvcCDVbUNeLA9Brge2NZue4C7YPAmAdwBvAW4Crjj7BuFJGk8LpqrQVWdBE625X9K8gywCbgJuKY1uwf4G+C/tvonq6qALye5LMnG1vZQVb0IkOQQsBP49AjHo1lM7v3ijPVjd9445p5IWkkLmtNPMgm8GXgY2NDeEAC+CWxoy5uA54eedrzVZqtLksZk3qGf5LXAZ4EPVNX3hte1vfoaRYeS7ElyOMnh06dPj2KTkqRmXqGf5GIGgf+pqvpcK7/Qpm1o96da/QSwZejpm1tttvqPqap9VTVVVVMTExMLGYskaQ7zOXsnwN3AM1X1kaFVB4CzZ+DsBu4fqr+3ncVzNfBSmwZ6ALg2yfp2APfaVpMkjcmcB3KBtwG/BjyZ5PFW+03gTuC+JLcA3wDe09YdBG4ApoHvA+8DqKoXk/wu8Ghr9ztnD+pKksZjPmfv/B2QWVa/c4b2Bdw6y7b2A/sX0kFJ0uj4jVxJ6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOzOfSylqg2f4f7YXI/50r9cU9fUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI7MGfpJ9ic5leSpodoHk5xI8ni73TC07vYk00mOJrluqL6z1aaT7B39UCRJc5nPnv6fADtnqH+0qna020GAJNuBXcCb2nP+MMm6JOuAjwPXA9uBm1tbSdIYzXk9/ar62yST89zeTcC9VfUD4OtJpoGr2rrpqnoOIMm9re3TC+6xJGnRljKnf1uSJ9r0z/pW2wQ8P9TmeKvNVpckjdFiQ/8u4GeBHcBJ4PdH1aEke5IcTnL49OnTo9qsJIlFhn5VvVBVL1fVD4E/5kdTOCeALUNNN7fabPWZtr2vqqaqampiYmIx3ZMkzWJRoZ9k49DDXwHOntlzANiV5NIkW4FtwCPAo8C2JFuTXMLgYO+BxXdbkrQYcx7ITfJp4BrgiiTHgTuAa5LsAAo4Bvw6QFUdSXIfgwO0Z4Bbq+rltp3bgAeAdcD+qjoy8tFIks5rPmfv3DxD+e7ztP8Q8KEZ6geBgwvqnSRppOYMffVpcu8XZ6wfu/PGMfdE0ih5GQZJ6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOnLRSndAq8vk3i/OWD92541j7
omkxXBPX5I6YuhLUkfmDP0k+5OcSvLUUO3yJIeSPNvu17d6knwsyXSSJ5JcOfSc3a39s0l2L89wJEnnM589/T8Bdp5T2ws8WFXbgAfbY4DrgW3ttge4CwZvEsAdwFuAq4A7zr5RSJLGZ87Qr6q/BV48p3wTcE9bvgf45aH6J2vgy8BlSTYC1wGHqurFqvoOcIhXv5FIkpbZYs/e2VBVJ9vyN4ENbXkT8PxQu+OtNlt9VZvtTBZJulAt+UBuVRVQI+gLAEn2JDmc5PDp06dHtVlJEosP/RfatA3t/lSrnwC2DLXb3Gqz1V+lqvZV1VRVTU1MTCyye5KkmSw29A8AZ8/A2Q3cP1R/bzuL52rgpTYN9ABwbZL17QDuta0mSRqjOef0k3wauAa4IslxBmfh3Ancl+QW4BvAe1rzg8ANwDTwfeB9AFX1YpLfBR5t7X6nqs49OCxJWmZzhn5V3TzLqnfO0LaAW2fZzn5g/4J6J0kaKa+9o5HwmjzS6uBlGCSpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUES+4pmXlhdikC4t7+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I64mUYtCK8PIO0MtzTl6SOGPqS1JElhX6SY0meTPJ4ksOtdnmSQ0mebffrWz1JPpZkOskTSa4cxQAkSfM3ij39X6yqHVU11R7vBR6sqm3Ag+0xwPXAtnbbA9w1gp8tSVqA5ZjeuQm4py3fA/zyUP2TNfBl4LIkG5fh50uSZrHUs3cK+KskBXyiqvYBG6rqZFv/TWBDW94EPD/03OOtdhKp8aweaXktNfTfXlUnkvwMcCjJPw6vrKpqbwjzlmQPg+kf3vCGNyyxe5KkYUua3qmqE+3+FPB54CrghbPTNu3+VGt+Atgy9PTNrXbuNvdV1VRVTU1MTCyle5Kkcyw69JP8VJLXnV0GrgWeAg4Au1uz3cD9bfkA8N52Fs/VwEtD00CSpDFYyvTOBuDzSc5u58+q6i+TPArcl+QW4BvAe1r7g8ANwDTwfeB9S/jZkqRFWHToV9VzwC/MUP828M4Z6gXcutifJ0laOr+RK0kd8YJr8zDbaYSStNq4py9JHXFPX6uCX9qSRsM9fUnqiKEvSR0x9CWpI4a+JHXEA7la1c53Oq0HeaVXc09fkjpi6EtSR5ze0Zrluf3Sq7mnL0kdMfQlqSOGviR1xDl9dce5fvXMPX1J6oihL0kdMfQlqSPO6UuNc/3qgaEvzcE3A60lTu9IUkfc05cWyU8AWo3c05ekjrinL42YnwB0ITP0h5zvH3JI0lpg6EtjstBPAH5i0HIYe+gn2Qn8AbAO+N9Vdee4+yBdSPyEqXEaa+gnWQd8HHgXcBx4NMmBqnp6nP2QVrOFvkn4yUDDxr2nfxUwXVXPASS5F7gJMPSlZTLKTxJORa1+4w79TcDzQ4+PA28Zcx/8OC0t0kL/di60v7XFvAmttTe0C+5AbpI9wJ728J+THF3C5q4AvrX0Xq0qvY25t/GCY160fHgEPVmGbc1iKWP+V7OtGHfonwC2DD3e3GqvqKp9wL5R/LAkh6tqahTbWi16G3Nv4wXH3IvlGvO4v5H7KLAtydYklwC7gANj7oMkdWuse/pVdSbJbcADDE7Z3F9VR8bZB0nq2djn9KvqIHBwTD9uJNNEq0xvY+5tvOCYe7EsY05VLcd2JUkXIK+yKUkdWZOhn2RnkqNJppPsXen+LEWS/UlOJXlqqHZ5kkNJnm3361s9ST7Wxv1EkiuHnrO7tX82ye6VGMt8JdmS5KEkTyc5kuT9rb5mx53kNUkeSfLVNubfbvWtSR5uY/tMOwGCJJe2x9Nt/eTQtm5v9aNJrluZEc1PknVJvpLkC+3xWh/vsSRPJnk8yeFWG+/ruqrW1I3BAeKvAW8ELgG+Cmxf6X4tYTz/FrgSeGqo9j+AvW15L
/DhtnwD8BdAgKuBh1v9cuC5dr++La9f6bGdZ8wbgSvb8uuA/wtsX8vjbn1/bVu+GHi4jeU+YFer/xHwH9vyfwL+qC3vAj7Tlre31/ylwNb2t7Bupcd3nnH/Z+DPgC+0x2t9vMeAK86pjfV1veK/hGX4pb4VeGDo8e3A7SvdryWOafKc0D8KbGzLG4GjbfkTwM3ntgNuBj4xVP+xdhf6DbifwfWauhg38C+Af2DwbfVvARe1+iuvbQZnwL21LV/U2uXc1/twuwvtxuB7Og8C7wC+0Pq/Zsfb+jdT6I/1db0Wp3dmutTDphXqy3LZUFUn2/I3gQ1tebaxr9rfSfsY/2YGe75retxtquNx4BRwiMFe63er6kxrMtz/V8bW1r8EvJ7VNeb/CfwX4Ift8etZ2+MFKOCvkjzWrj4AY35dX3CXYdDCVFUlWZOnYCV5LfBZ4ANV9b0kr6xbi+OuqpeBHUkuAz4P/PwKd2nZJPkl4FRVPZbkmpXuzxi9vapOJPkZ4FCSfxxeOY7X9Vrc05/zUg9rwAtJNgK0+1OtPtvYV93vJMnFDAL/U1X1uVZe8+MGqKrvAg8xmN64LMnZnbPh/r8ytrb+p4Fvs3rG/Dbg3yU5BtzLYIrnD1i74wWgqk60+1MM3tivYsyv67UY+j1c6uEAcPaI/W4Gc95n6+9tR/2vBl5qHxsfAK5Nsr6dGXBtq12QMtilvxt4pqo+MrRqzY47yUTbwyfJTzI4hvEMg/B/d2t27pjP/i7eDXypBhO8B4Bd7WyXrcA24JHxjGL+qur2qtpcVZMM/ka/VFX/njU6XoAkP5XkdWeXGbwen2Lcr+uVPrCxTAdLbmBwxsfXgN9a6f4scSyfBk4C/4/B3N0tDOYyHwSeBf4auLy1DYN/UvM14Elgamg7/wGYbrf3rfS45hjz2xnMfT4BPN5uN6zlcQP/BvhKG/NTwH9r9TcyCLFp4P8Al7b6a9rj6bb+jUPb+q32uzgKXL/SY5vH2K/hR2fvrNnxtrF9td2OnM2mcb+u/UauJHVkLU7vSJJmYehLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktSR/w98oJqWTDzXFQAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - } - } - ], - "source": [ - "# Zoom in: histogram restricted to recipe lengths between 0 and 5000 characters\n", - "plt.hist(recipes_lengths, bins=50, range=(0, 5000))\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": { - "id": "xAPiEpplMaRs" - }, - "outputs": [], - "source": [ - "MAX_RECIPE_LENGTH = 2000" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "oXyK4r-DR8RH", - "outputId": "f8aacf82-cdaa-48a2-c853-2ecd4a16d04d" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Dataset size BEFORE filtering: 27863\n", - "Dataset size AFTER filtering: 26062\n", - "Number of eliminated recipes: 1801\n" - ] - } - ], - "source": [ - "def filter_recipes_by_length(recipe_test):\n", - "    \"\"\"Return True when the text has at most MAX_RECIPE_LENGTH characters.\"\"\"\n", - "    return not len(recipe_test) > MAX_RECIPE_LENGTH\n", - "\n", - "# Keep only the recipes that pass the length check, preserving their order.\n", - "dataset_filtered = list(filter(filter_recipes_by_length, dataset_stringified))\n", - "\n", - "print('Dataset size BEFORE filtering: ', len(dataset_stringified))\n", - "print('Dataset size AFTER filtering: ', len(dataset_filtered))\n", - "print('Number of eliminated recipes: ', len(dataset_stringified) - len(dataset_filtered))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FLF8-MNJSQfh" - }, - "source": [ - "### Creating Vocabulary" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "upyJzPr9SVJg", - "outputId": "a44ef963-d4d2-411a-dc85-0d3e4c7de6f4" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "{'char_level': True,\n", - " 'document_count': 26063,\n", - " 'filters': '',\n", - " 'index_docs': '{\"1\": 26062, \"71\": 332, \"61\": 7536, \"12\": 26061, \"51\": 26062, \"28\": 24713, \"22\": 26013, \"2\": 26062, \"14\": 26058, 
\"42\": 20638, \"17\": 26061, \"48\": 26062, \"58\": 12158, \"11\": 26062, \"18\": 26034, \"7\": 26062, \"10\": 26061, \"34\": 23968, \"50\": 26062, \"54\": 26062, \"35\": 26062, \"3\": 26062, \"30\": 26062, \"16\": 26056, \"56\": 12988, \"40\": 26062, \"55\": 10980, \"52\": 26062, \"6\": 26062, \"31\": 26062, \"5\": 26062, \"15\": 26054, \"41\": 26062, \"47\": 26062, \"46\": 13270, \"21\": 26012, \"13\": 26059, \"19\": 26043, \"24\": 26062, \"29\": 25690, \"49\": 26062, \"23\": 26062, \"9\": 26062, \"27\": 26062, \"4\": 26062, \"33\": 26062, \"44\": 21103, \"53\": 26062, \"25\": 26062, \"32\": 26062, \"39\": 26062, \"20\": 26047, \"8\": 26062, \"43\": 21056, \"26\": 25905, \"38\": 20037, \"45\": 17710, \"63\": 6519, \"36\": 22903, \"57\": 10546, \"37\": 20846, \"62\": 5632, \"60\": 8227, \"65\": 4413, \"67\": 3513, \"59\": 10272, \"70\": 2764, \"78\": 149, \"68\": 1803, \"64\": 5275, \"74\": 620, \"66\": 2121, \"69\": 1965, \"88\": 51, \"72\": 668, \"80\": 76, \"81\": 34, \"82\": 82, \"73\": 693, \"84\": 76, \"83\": 77, \"75\": 216, \"76\": 283, \"77\": 117, \"91\": 22, \"94\": 16, \"79\": 100, \"90\": 33, \"96\": 16, \"92\": 23, \"93\": 17, \"106\": 3, \"87\": 44, \"85\": 67, \"86\": 64, \"103\": 9, \"98\": 13, \"97\": 14, \"100\": 14, \"104\": 4, \"120\": 1, \"121\": 1, \"101\": 11, \"89\": 36, \"95\": 20, \"99\": 13, \"114\": 1, \"122\": 1, \"107\": 3, \"123\": 1, \"108\": 3, \"109\": 3, \"110\": 3, \"105\": 5, \"115\": 2, \"116\": 2, \"124\": 1, \"117\": 1, \"102\": 3, \"111\": 2, \"118\": 2, \"119\": 2, \"112\": 3, \"125\": 1, \"126\": 1, \"127\": 1, \"128\": 1, \"129\": 1, \"130\": 1, \"113\": 3}',\n", - " 'index_word': '{\"1\": \" \", \"2\": \"e\", \"3\": \"a\", \"4\": \"t\", \"5\": \"o\", \"6\": \"i\", \"7\": \"n\", \"8\": \"r\", \"9\": \"s\", \"10\": \"l\", \"11\": \"\\\\n\", \"12\": \"d\", \"13\": \"u\", \"14\": \"c\", \"15\": \"h\", \"16\": \"g\", \"17\": \"m\", \"18\": \"p\", \"19\": \"b\", \"20\": \"f\", \"21\": \"w\", \"22\": \"k\", \"23\": 
\"\\\\u25aa\", \"24\": \"\\\\ufe0e\", \"25\": \"\\\\u2022\", \"26\": \"y\", \"27\": \"I\", \"28\": \",\", \"29\": \"v\", \"30\": \"T\", \"31\": \"N\", \"32\": \"E\", \"33\": \"S\", \"34\": \"x\", \"35\": \"R\", \"36\": \".\", \"37\": \"1\", \"38\": \"-\", \"39\": \"D\", \"40\": \"C\", \"41\": \"O\", \"42\": \"0\", \"43\": \"5\", \"44\": \"3\", \"45\": \"2\", \"46\": \"\\'\", \"47\": \"\\\\ud83d\\\\udccc\", \"48\": \"L\", \"49\": \"\\\\ud83d\\\\udc40\", \"50\": \"P\", \"51\": \"\\\\ud83c\\\\udf52\", \"52\": \"G\", \"53\": \"\\\\ud83d\\\\udcdd\", \"54\": \"U\", \"55\": \"z\", \"56\": \"j\", \"57\": \"!\", \"58\": \"4\", \"59\": \"/\", \"60\": \"q\", \"61\": \"9\", \"62\": \":\", \"63\": \"8\", \"64\": \"7\", \"65\": \"6\", \"66\": \"\\\\\"\", \"67\": \")\", \"68\": \"&\", \"69\": \"\\\\r\", \"70\": \"(\", \"71\": \"*\", \"72\": \"#\", \"73\": \";\", \"74\": \"?\", \"75\": \"\\\\u2019\", \"76\": \"%\", \"77\": \"=\", \"78\": \"@\", \"79\": \"+\", \"80\": \"~\", \"81\": \"_\", \"82\": \"\\\\\\\\\", \"83\": \"[\", \"84\": \"]\", \"85\": \"\\\\u201d\", \"86\": \"\\\\u201c\", \"87\": \"\\\\u2014\", \"88\": \"$\", \"89\": \"\\\\u00e9\", \"90\": \"\\\\u2013\", \"91\": \">\", \"92\": \"<\", \"93\": \"^\", \"94\": \"\\\\u00bd\", \"95\": \"`\", \"96\": \"\\\\u2026\", \"97\": \"}\", \"98\": \"{\", \"99\": \"\\\\u00ae\", \"100\": \"\\\\u2018\", \"101\": \"\\\\u00b0\", \"102\": \"|\", \"103\": \"\\\\u00e8\", \"104\": \"\\\\u00a0\", \"105\": \"\\\\u00f1\", \"106\": \"\\\\u00a9\", \"107\": \"\\\\u00e4\", \"108\": \"\\\\u00ef\", \"109\": \"\\\\u00bf\", \"110\": \"\\\\u00fc\", \"111\": \"\\\\u00bc\", \"112\": \"\\\\u00ee\", \"113\": \"\\\\u2122\", \"114\": \"\\\\u00fa\", \"115\": \"\\\\u00b4\", \"116\": \"\\\\u00e7\", \"117\": \"\\\\u00fb\", \"118\": \"\\\\u00f3\", \"119\": \"\\\\u00f6\", \"120\": \"\\\\u00f4\", \"121\": \"\\\\u00be\", \"122\": \"\\\\u00ed\", \"123\": \"\\\\u00ba\", \"124\": \"\\\\u00e2\", \"125\": \"\\\\u00ea\", \"126\": \"\\\\t\", \"127\": \"\\\\u00eb\", \"128\": 
\"\\\\u00a2\", \"129\": \"\\\\u00f9\", \"130\": \"\\\\u00e0\"}',\n", - " 'lower': False,\n", - " 'num_words': None,\n", - " 'oov_token': None,\n", - " 'split': '',\n", - " 'word_counts': '{\"*\": 854, \"\\\\ud83d\\\\udccc\": 26062, \" \": 4351982, \"T\": 156372, \"I\": 182434, \"L\": 26062, \"E\": 104248, \"\\\\n\": 875409, \"b\": 407092, \"e\": 2188153, \"a\": 1562715, \"t\": 1507990, \"h\": 656263, \"i\": 1293026, \"s\": 1050269, \"n\": 1255003, \"r\": 1231768, \"d\": 732758, \"\\\\ud83d\\\\udc40\": 26062, \"D\": 52124, \"S\": 104248, \"C\": 52124, \"R\": 78186, \"P\": 26062, \"O\": 52124, \"N\": 130310, \"f\": 373111, \"o\": 1405310, \"m\": 498925, \"g\": 515216, \"\\'\": 27039, \"\\\\ud83c\\\\udf52\": 26062, \"G\": 26062, \"\\\\u2022\": 249672, \"u\": 662939, \"l\": 901468, \"j\": 24245, \"c\": 662778, \"k\": 297192, \"\\\\ud83d\\\\udcdd\": 26062, \"U\": 26062, \"\\\\u25aa\": 283108, \"\\\\ufe0e\": 283108, \"p\": 471700, \"v\": 169059, \"3\": 41026, \"5\": 43404, \"0\": 48546, \"w\": 303623, \"9\": 9143, \"x\": 89013, \",\": 172710, \"y\": 225370, \"4\": 18365, \"z\": 25223, \"2\": 37800, \"-\": 53332, \".\": 67292, \"q\": 11984, \"!\": 18552, \"8\": 8293, \"1\": 55874, \":\": 8808, \")\": 4259, \"6\": 5260, \"/\": 18210, \"(\": 3216, \"@\": 169, \"&\": 4188, \"7\": 6286, \"?\": 771, \"\\\\\"\": 4754, \"\\\\r\": 3820, \"$\": 58, \"#\": 812, \"~\": 125, \"_\": 113, \"\\\\\\\\\": 100, \";\": 801, \"[\": 96, \"]\": 96, \"\\\\u2019\": 309, \"%\": 304, \"=\": 170, \">\": 32, \"\\\\u00bd\": 24, \"+\": 167, \"\\\\u2013\": 37, \"\\\\u2026\": 18, \"^\": 27, \"<\": 30, \"\\\\u00a9\": 3, \"\\\\u2014\": 60, \"\\\\u201c\": 81, \"\\\\u201d\": 84, \"\\\\u00e8\": 9, \"{\": 17, \"}\": 18, \"\\\\u2018\": 14, \"\\\\u00a0\": 7, \"\\\\u00f4\": 1, \"\\\\u00be\": 1, \"\\\\u00b0\": 11, \"\\\\u00e9\": 42, \"`\": 22, \"\\\\u00ae\": 16, \"\\\\u00fa\": 2, \"\\\\u00ed\": 1, \"\\\\u00e4\": 3, \"\\\\u00ba\": 1, \"\\\\u00ef\": 3, \"\\\\u00bf\": 3, \"\\\\u00fc\": 3, \"\\\\u00f1\": 5, 
\"\\\\u00b4\": 2, \"\\\\u00e7\": 2, \"\\\\u00e2\": 1, \"\\\\u00fb\": 2, \"|\": 10, \"\\\\u00bc\": 3, \"\\\\u00f3\": 2, \"\\\\u00f6\": 2, \"\\\\u00ee\": 3, \"\\\\u00ea\": 1, \"\\\\t\": 1, \"\\\\u00eb\": 1, \"\\\\u00a2\": 1, \"\\\\u00f9\": 1, \"\\\\u00e0\": 1, \"\\\\u2122\": 3}',\n", - " 'word_docs': '{\"*\": 332, \"9\": 7536, \"d\": 26061, \"\\\\ud83c\\\\udf52\": 26062, \",\": 24713, \"k\": 26013, \"e\": 26062, \"c\": 26058, \"0\": 20638, \"m\": 26061, \"L\": 26062, \"4\": 12158, \"\\\\n\": 26062, \"p\": 26034, \"n\": 26062, \"l\": 26061, \"x\": 23968, \"P\": 26062, \"U\": 26062, \"R\": 26062, \"a\": 26062, \"T\": 26062, \"g\": 26056, \"j\": 12988, \" \": 26062, \"C\": 26062, \"z\": 10980, \"G\": 26062, \"i\": 26062, \"N\": 26062, \"o\": 26062, \"h\": 26054, \"O\": 26062, \"\\\\ud83d\\\\udccc\": 26062, \"\\'\": 13270, \"w\": 26012, \"u\": 26059, \"b\": 26043, \"\\\\ufe0e\": 26062, \"v\": 25690, \"\\\\ud83d\\\\udc40\": 26062, \"\\\\u25aa\": 26062, \"s\": 26062, \"I\": 26062, \"t\": 26062, \"S\": 26062, \"3\": 21103, \"\\\\ud83d\\\\udcdd\": 26062, \"\\\\u2022\": 26062, \"E\": 26062, \"D\": 26062, \"f\": 26047, \"r\": 26062, \"5\": 21056, \"y\": 25905, \"-\": 20037, \"2\": 17710, \"8\": 6519, \".\": 22903, \"!\": 10546, \"1\": 20846, \":\": 5632, \"q\": 8227, \"6\": 4413, \")\": 3513, \"/\": 10272, \"(\": 2764, \"@\": 149, \"&\": 1803, \"7\": 5275, \"?\": 620, \"\\\\\"\": 2121, \"\\\\r\": 1965, \"$\": 51, \"#\": 668, \"~\": 76, \"_\": 34, \"\\\\\\\\\": 82, \";\": 693, \"]\": 76, \"[\": 77, \"\\\\u2019\": 216, \"%\": 283, \"=\": 117, \">\": 22, \"\\\\u00bd\": 16, \"+\": 100, \"\\\\u2013\": 33, \"\\\\u2026\": 16, \"<\": 23, \"^\": 17, \"\\\\u00a9\": 3, \"\\\\u2014\": 44, \"\\\\u201d\": 67, \"\\\\u201c\": 64, \"\\\\u00e8\": 9, \"{\": 13, \"}\": 14, \"\\\\u2018\": 14, \"\\\\u00a0\": 4, \"\\\\u00f4\": 1, \"\\\\u00be\": 1, \"\\\\u00b0\": 11, \"\\\\u00e9\": 36, \"`\": 20, \"\\\\u00ae\": 13, \"\\\\u00fa\": 1, \"\\\\u00ed\": 1, \"\\\\u00e4\": 3, \"\\\\u00ba\": 1, \"\\\\u00ef\": 
3, \"\\\\u00bf\": 3, \"\\\\u00fc\": 3, \"\\\\u00f1\": 5, \"\\\\u00b4\": 2, \"\\\\u00e7\": 2, \"\\\\u00e2\": 1, \"\\\\u00fb\": 1, \"|\": 3, \"\\\\u00bc\": 2, \"\\\\u00f3\": 2, \"\\\\u00f6\": 2, \"\\\\u00ee\": 3, \"\\\\u00ea\": 1, \"\\\\t\": 1, \"\\\\u00eb\": 1, \"\\\\u00a2\": 1, \"\\\\u00f9\": 1, \"\\\\u00e0\": 1, \"\\\\u2122\": 3}',\n", - " 'word_index': '{\" \": 1, \"e\": 2, \"a\": 3, \"t\": 4, \"o\": 5, \"i\": 6, \"n\": 7, \"r\": 8, \"s\": 9, \"l\": 10, \"\\\\n\": 11, \"d\": 12, \"u\": 13, \"c\": 14, \"h\": 15, \"g\": 16, \"m\": 17, \"p\": 18, \"b\": 19, \"f\": 20, \"w\": 21, \"k\": 22, \"\\\\u25aa\": 23, \"\\\\ufe0e\": 24, \"\\\\u2022\": 25, \"y\": 26, \"I\": 27, \",\": 28, \"v\": 29, \"T\": 30, \"N\": 31, \"E\": 32, \"S\": 33, \"x\": 34, \"R\": 35, \".\": 36, \"1\": 37, \"-\": 38, \"D\": 39, \"C\": 40, \"O\": 41, \"0\": 42, \"5\": 43, \"3\": 44, \"2\": 45, \"\\'\": 46, \"\\\\ud83d\\\\udccc\": 47, \"L\": 48, \"\\\\ud83d\\\\udc40\": 49, \"P\": 50, \"\\\\ud83c\\\\udf52\": 51, \"G\": 52, \"\\\\ud83d\\\\udcdd\": 53, \"U\": 54, \"z\": 55, \"j\": 56, \"!\": 57, \"4\": 58, \"/\": 59, \"q\": 60, \"9\": 61, \":\": 62, \"8\": 63, \"7\": 64, \"6\": 65, \"\\\\\"\": 66, \")\": 67, \"&\": 68, \"\\\\r\": 69, \"(\": 70, \"*\": 71, \"#\": 72, \";\": 73, \"?\": 74, \"\\\\u2019\": 75, \"%\": 76, \"=\": 77, \"@\": 78, \"+\": 79, \"~\": 80, \"_\": 81, \"\\\\\\\\\": 82, \"[\": 83, \"]\": 84, \"\\\\u201d\": 85, \"\\\\u201c\": 86, \"\\\\u2014\": 87, \"$\": 88, \"\\\\u00e9\": 89, \"\\\\u2013\": 90, \">\": 91, \"<\": 92, \"^\": 93, \"\\\\u00bd\": 94, \"`\": 95, \"\\\\u2026\": 96, \"}\": 97, \"{\": 98, \"\\\\u00ae\": 99, \"\\\\u2018\": 100, \"\\\\u00b0\": 101, \"|\": 102, \"\\\\u00e8\": 103, \"\\\\u00a0\": 104, \"\\\\u00f1\": 105, \"\\\\u00a9\": 106, \"\\\\u00e4\": 107, \"\\\\u00ef\": 108, \"\\\\u00bf\": 109, \"\\\\u00fc\": 110, \"\\\\u00bc\": 111, \"\\\\u00ee\": 112, \"\\\\u2122\": 113, \"\\\\u00fa\": 114, \"\\\\u00b4\": 115, \"\\\\u00e7\": 116, \"\\\\u00fb\": 117, \"\\\\u00f3\": 118, 
\"\\\\u00f6\": 119, \"\\\\u00f4\": 120, \"\\\\u00be\": 121, \"\\\\u00ed\": 122, \"\\\\u00ba\": 123, \"\\\\u00e2\": 124, \"\\\\u00ea\": 125, \"\\\\t\": 126, \"\\\\u00eb\": 127, \"\\\\u00a2\": 128, \"\\\\u00f9\": 129, \"\\\\u00e0\": 130}'}" - ] - }, - "metadata": {}, - "execution_count": 42 - } - ], - "source": [ - "STOP_SIGN = '*'\n", - "\n", - "tokenizer = tf.keras.preprocessing.text.Tokenizer(\n", - " char_level=True,\n", - " filters='',\n", - " lower=False,\n", - " split=''\n", - ")\n", - "\n", - "# Stop word is not a part of recipes, but tokenizer must know about it as well.\n", - "tokenizer.fit_on_texts([STOP_SIGN])\n", - "\n", - "tokenizer.fit_on_texts(dataset_filtered)\n", - "\n", - "tokenizer.get_config()" - ] - }, - { - "cell_type": "code", - "source": [ - "tokenizer_json = tokenizer.to_json()" - ], - "metadata": { - "id": "gmQa_mtVjsdx" - }, - "execution_count": 43, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "print(tokenizer_json)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "A1xjvcQYj7Oj", - "outputId": "d3450fd6-09a7-43dd-b544-8378c98a3264" - }, - "execution_count": 44, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "{\"class_name\": \"Tokenizer\", \"config\": {\"num_words\": null, \"filters\": \"\", \"lower\": false, \"split\": \"\", \"char_level\": true, \"oov_token\": null, \"document_count\": 26063, \"word_counts\": \"{\\\"*\\\": 854, \\\"\\\\ud83d\\\\udccc\\\": 26062, \\\" \\\": 4351982, \\\"T\\\": 156372, \\\"I\\\": 182434, \\\"L\\\": 26062, \\\"E\\\": 104248, \\\"\\\\n\\\": 875409, \\\"b\\\": 407092, \\\"e\\\": 2188153, \\\"a\\\": 1562715, \\\"t\\\": 1507990, \\\"h\\\": 656263, \\\"i\\\": 1293026, \\\"s\\\": 1050269, \\\"n\\\": 1255003, \\\"r\\\": 1231768, \\\"d\\\": 732758, \\\"\\\\ud83d\\\\udc40\\\": 26062, \\\"D\\\": 52124, \\\"S\\\": 104248, \\\"C\\\": 52124, \\\"R\\\": 78186, \\\"P\\\": 26062, \\\"O\\\": 52124, \\\"N\\\": 130310, \\\"f\\\": 
373111, \\\"o\\\": 1405310, \\\"m\\\": 498925, \\\"g\\\": 515216, \\\"'\\\": 27039, \\\"\\\\ud83c\\\\udf52\\\": 26062, \\\"G\\\": 26062, \\\"\\\\u2022\\\": 249672, \\\"u\\\": 662939, \\\"l\\\": 901468, \\\"j\\\": 24245, \\\"c\\\": 662778, \\\"k\\\": 297192, \\\"\\\\ud83d\\\\udcdd\\\": 26062, \\\"U\\\": 26062, \\\"\\\\u25aa\\\": 283108, \\\"\\\\ufe0e\\\": 283108, \\\"p\\\": 471700, \\\"v\\\": 169059, \\\"3\\\": 41026, \\\"5\\\": 43404, \\\"0\\\": 48546, \\\"w\\\": 303623, \\\"9\\\": 9143, \\\"x\\\": 89013, \\\",\\\": 172710, \\\"y\\\": 225370, \\\"4\\\": 18365, \\\"z\\\": 25223, \\\"2\\\": 37800, \\\"-\\\": 53332, \\\".\\\": 67292, \\\"q\\\": 11984, \\\"!\\\": 18552, \\\"8\\\": 8293, \\\"1\\\": 55874, \\\":\\\": 8808, \\\")\\\": 4259, \\\"6\\\": 5260, \\\"/\\\": 18210, \\\"(\\\": 3216, \\\"@\\\": 169, \\\"&\\\": 4188, \\\"7\\\": 6286, \\\"?\\\": 771, \\\"\\\\\\\"\\\": 4754, \\\"\\\\r\\\": 3820, \\\"$\\\": 58, \\\"#\\\": 812, \\\"~\\\": 125, \\\"_\\\": 113, \\\"\\\\\\\\\\\": 100, \\\";\\\": 801, \\\"[\\\": 96, \\\"]\\\": 96, \\\"\\\\u2019\\\": 309, \\\"%\\\": 304, \\\"=\\\": 170, \\\">\\\": 32, \\\"\\\\u00bd\\\": 24, \\\"+\\\": 167, \\\"\\\\u2013\\\": 37, \\\"\\\\u2026\\\": 18, \\\"^\\\": 27, \\\"<\\\": 30, \\\"\\\\u00a9\\\": 3, \\\"\\\\u2014\\\": 60, \\\"\\\\u201c\\\": 81, \\\"\\\\u201d\\\": 84, \\\"\\\\u00e8\\\": 9, \\\"{\\\": 17, \\\"}\\\": 18, \\\"\\\\u2018\\\": 14, \\\"\\\\u00a0\\\": 7, \\\"\\\\u00f4\\\": 1, \\\"\\\\u00be\\\": 1, \\\"\\\\u00b0\\\": 11, \\\"\\\\u00e9\\\": 42, \\\"`\\\": 22, \\\"\\\\u00ae\\\": 16, \\\"\\\\u00fa\\\": 2, \\\"\\\\u00ed\\\": 1, \\\"\\\\u00e4\\\": 3, \\\"\\\\u00ba\\\": 1, \\\"\\\\u00ef\\\": 3, \\\"\\\\u00bf\\\": 3, \\\"\\\\u00fc\\\": 3, \\\"\\\\u00f1\\\": 5, \\\"\\\\u00b4\\\": 2, \\\"\\\\u00e7\\\": 2, \\\"\\\\u00e2\\\": 1, \\\"\\\\u00fb\\\": 2, \\\"|\\\": 10, \\\"\\\\u00bc\\\": 3, \\\"\\\\u00f3\\\": 2, \\\"\\\\u00f6\\\": 2, \\\"\\\\u00ee\\\": 3, \\\"\\\\u00ea\\\": 1, \\\"\\\\t\\\": 1, \\\"\\\\u00eb\\\": 1, \\\"\\\\u00a2\\\": 1, 
\\\"\\\\u00f9\\\": 1, \\\"\\\\u00e0\\\": 1, \\\"\\\\u2122\\\": 3}\", \"word_docs\": \"{\\\"*\\\": 332, \\\"9\\\": 7536, \\\"d\\\": 26061, \\\"\\\\ud83c\\\\udf52\\\": 26062, \\\",\\\": 24713, \\\"k\\\": 26013, \\\"e\\\": 26062, \\\"c\\\": 26058, \\\"0\\\": 20638, \\\"m\\\": 26061, \\\"L\\\": 26062, \\\"4\\\": 12158, \\\"\\\\n\\\": 26062, \\\"p\\\": 26034, \\\"n\\\": 26062, \\\"l\\\": 26061, \\\"x\\\": 23968, \\\"P\\\": 26062, \\\"U\\\": 26062, \\\"R\\\": 26062, \\\"a\\\": 26062, \\\"T\\\": 26062, \\\"g\\\": 26056, \\\"j\\\": 12988, \\\" \\\": 26062, \\\"C\\\": 26062, \\\"z\\\": 10980, \\\"G\\\": 26062, \\\"i\\\": 26062, \\\"N\\\": 26062, \\\"o\\\": 26062, \\\"h\\\": 26054, \\\"O\\\": 26062, \\\"\\\\ud83d\\\\udccc\\\": 26062, \\\"'\\\": 13270, \\\"w\\\": 26012, \\\"u\\\": 26059, \\\"b\\\": 26043, \\\"\\\\ufe0e\\\": 26062, \\\"v\\\": 25690, \\\"\\\\ud83d\\\\udc40\\\": 26062, \\\"\\\\u25aa\\\": 26062, \\\"s\\\": 26062, \\\"I\\\": 26062, \\\"t\\\": 26062, \\\"S\\\": 26062, \\\"3\\\": 21103, \\\"\\\\ud83d\\\\udcdd\\\": 26062, \\\"\\\\u2022\\\": 26062, \\\"E\\\": 26062, \\\"D\\\": 26062, \\\"f\\\": 26047, \\\"r\\\": 26062, \\\"5\\\": 21056, \\\"y\\\": 25905, \\\"-\\\": 20037, \\\"2\\\": 17710, \\\"8\\\": 6519, \\\".\\\": 22903, \\\"!\\\": 10546, \\\"1\\\": 20846, \\\":\\\": 5632, \\\"q\\\": 8227, \\\"6\\\": 4413, \\\")\\\": 3513, \\\"/\\\": 10272, \\\"(\\\": 2764, \\\"@\\\": 149, \\\"&\\\": 1803, \\\"7\\\": 5275, \\\"?\\\": 620, \\\"\\\\\\\"\\\": 2121, \\\"\\\\r\\\": 1965, \\\"$\\\": 51, \\\"#\\\": 668, \\\"~\\\": 76, \\\"_\\\": 34, \\\"\\\\\\\\\\\": 82, \\\";\\\": 693, \\\"]\\\": 76, \\\"[\\\": 77, \\\"\\\\u2019\\\": 216, \\\"%\\\": 283, \\\"=\\\": 117, \\\">\\\": 22, \\\"\\\\u00bd\\\": 16, \\\"+\\\": 100, \\\"\\\\u2013\\\": 33, \\\"\\\\u2026\\\": 16, \\\"<\\\": 23, \\\"^\\\": 17, \\\"\\\\u00a9\\\": 3, \\\"\\\\u2014\\\": 44, \\\"\\\\u201d\\\": 67, \\\"\\\\u201c\\\": 64, \\\"\\\\u00e8\\\": 9, \\\"{\\\": 13, \\\"}\\\": 14, \\\"\\\\u2018\\\": 14, \\\"\\\\u00a0\\\": 4, 
\\\"\\\\u00f4\\\": 1, \\\"\\\\u00be\\\": 1, \\\"\\\\u00b0\\\": 11, \\\"\\\\u00e9\\\": 36, \\\"`\\\": 20, \\\"\\\\u00ae\\\": 13, \\\"\\\\u00fa\\\": 1, \\\"\\\\u00ed\\\": 1, \\\"\\\\u00e4\\\": 3, \\\"\\\\u00ba\\\": 1, \\\"\\\\u00ef\\\": 3, \\\"\\\\u00bf\\\": 3, \\\"\\\\u00fc\\\": 3, \\\"\\\\u00f1\\\": 5, \\\"\\\\u00b4\\\": 2, \\\"\\\\u00e7\\\": 2, \\\"\\\\u00e2\\\": 1, \\\"\\\\u00fb\\\": 1, \\\"|\\\": 3, \\\"\\\\u00bc\\\": 2, \\\"\\\\u00f3\\\": 2, \\\"\\\\u00f6\\\": 2, \\\"\\\\u00ee\\\": 3, \\\"\\\\u00ea\\\": 1, \\\"\\\\t\\\": 1, \\\"\\\\u00eb\\\": 1, \\\"\\\\u00a2\\\": 1, \\\"\\\\u00f9\\\": 1, \\\"\\\\u00e0\\\": 1, \\\"\\\\u2122\\\": 3}\", \"index_docs\": \"{\\\"1\\\": 26062, \\\"71\\\": 332, \\\"61\\\": 7536, \\\"12\\\": 26061, \\\"51\\\": 26062, \\\"28\\\": 24713, \\\"22\\\": 26013, \\\"2\\\": 26062, \\\"14\\\": 26058, \\\"42\\\": 20638, \\\"17\\\": 26061, \\\"48\\\": 26062, \\\"58\\\": 12158, \\\"11\\\": 26062, \\\"18\\\": 26034, \\\"7\\\": 26062, \\\"10\\\": 26061, \\\"34\\\": 23968, \\\"50\\\": 26062, \\\"54\\\": 26062, \\\"35\\\": 26062, \\\"3\\\": 26062, \\\"30\\\": 26062, \\\"16\\\": 26056, \\\"56\\\": 12988, \\\"40\\\": 26062, \\\"55\\\": 10980, \\\"52\\\": 26062, \\\"6\\\": 26062, \\\"31\\\": 26062, \\\"5\\\": 26062, \\\"15\\\": 26054, \\\"41\\\": 26062, \\\"47\\\": 26062, \\\"46\\\": 13270, \\\"21\\\": 26012, \\\"13\\\": 26059, \\\"19\\\": 26043, \\\"24\\\": 26062, \\\"29\\\": 25690, \\\"49\\\": 26062, \\\"23\\\": 26062, \\\"9\\\": 26062, \\\"27\\\": 26062, \\\"4\\\": 26062, \\\"33\\\": 26062, \\\"44\\\": 21103, \\\"53\\\": 26062, \\\"25\\\": 26062, \\\"32\\\": 26062, \\\"39\\\": 26062, \\\"20\\\": 26047, \\\"8\\\": 26062, \\\"43\\\": 21056, \\\"26\\\": 25905, \\\"38\\\": 20037, \\\"45\\\": 17710, \\\"63\\\": 6519, \\\"36\\\": 22903, \\\"57\\\": 10546, \\\"37\\\": 20846, \\\"62\\\": 5632, \\\"60\\\": 8227, \\\"65\\\": 4413, \\\"67\\\": 3513, \\\"59\\\": 10272, \\\"70\\\": 2764, \\\"78\\\": 149, \\\"68\\\": 1803, \\\"64\\\": 5275, \\\"74\\\": 620, 
\\\"66\\\": 2121, \\\"69\\\": 1965, \\\"88\\\": 51, \\\"72\\\": 668, \\\"80\\\": 76, \\\"81\\\": 34, \\\"82\\\": 82, \\\"73\\\": 693, \\\"84\\\": 76, \\\"83\\\": 77, \\\"75\\\": 216, \\\"76\\\": 283, \\\"77\\\": 117, \\\"91\\\": 22, \\\"94\\\": 16, \\\"79\\\": 100, \\\"90\\\": 33, \\\"96\\\": 16, \\\"92\\\": 23, \\\"93\\\": 17, \\\"106\\\": 3, \\\"87\\\": 44, \\\"85\\\": 67, \\\"86\\\": 64, \\\"103\\\": 9, \\\"98\\\": 13, \\\"97\\\": 14, \\\"100\\\": 14, \\\"104\\\": 4, \\\"120\\\": 1, \\\"121\\\": 1, \\\"101\\\": 11, \\\"89\\\": 36, \\\"95\\\": 20, \\\"99\\\": 13, \\\"114\\\": 1, \\\"122\\\": 1, \\\"107\\\": 3, \\\"123\\\": 1, \\\"108\\\": 3, \\\"109\\\": 3, \\\"110\\\": 3, \\\"105\\\": 5, \\\"115\\\": 2, \\\"116\\\": 2, \\\"124\\\": 1, \\\"117\\\": 1, \\\"102\\\": 3, \\\"111\\\": 2, \\\"118\\\": 2, \\\"119\\\": 2, \\\"112\\\": 3, \\\"125\\\": 1, \\\"126\\\": 1, \\\"127\\\": 1, \\\"128\\\": 1, \\\"129\\\": 1, \\\"130\\\": 1, \\\"113\\\": 3}\", \"index_word\": \"{\\\"1\\\": \\\" \\\", \\\"2\\\": \\\"e\\\", \\\"3\\\": \\\"a\\\", \\\"4\\\": \\\"t\\\", \\\"5\\\": \\\"o\\\", \\\"6\\\": \\\"i\\\", \\\"7\\\": \\\"n\\\", \\\"8\\\": \\\"r\\\", \\\"9\\\": \\\"s\\\", \\\"10\\\": \\\"l\\\", \\\"11\\\": \\\"\\\\n\\\", \\\"12\\\": \\\"d\\\", \\\"13\\\": \\\"u\\\", \\\"14\\\": \\\"c\\\", \\\"15\\\": \\\"h\\\", \\\"16\\\": \\\"g\\\", \\\"17\\\": \\\"m\\\", \\\"18\\\": \\\"p\\\", \\\"19\\\": \\\"b\\\", \\\"20\\\": \\\"f\\\", \\\"21\\\": \\\"w\\\", \\\"22\\\": \\\"k\\\", \\\"23\\\": \\\"\\\\u25aa\\\", \\\"24\\\": \\\"\\\\ufe0e\\\", \\\"25\\\": \\\"\\\\u2022\\\", \\\"26\\\": \\\"y\\\", \\\"27\\\": \\\"I\\\", \\\"28\\\": \\\",\\\", \\\"29\\\": \\\"v\\\", \\\"30\\\": \\\"T\\\", \\\"31\\\": \\\"N\\\", \\\"32\\\": \\\"E\\\", \\\"33\\\": \\\"S\\\", \\\"34\\\": \\\"x\\\", \\\"35\\\": \\\"R\\\", \\\"36\\\": \\\".\\\", \\\"37\\\": \\\"1\\\", \\\"38\\\": \\\"-\\\", \\\"39\\\": \\\"D\\\", \\\"40\\\": \\\"C\\\", \\\"41\\\": \\\"O\\\", \\\"42\\\": \\\"0\\\", \\\"43\\\": \\\"5\\\", \\\"44\\\": 
\\\"3\\\", \\\"45\\\": \\\"2\\\", \\\"46\\\": \\\"'\\\", \\\"47\\\": \\\"\\\\ud83d\\\\udccc\\\", \\\"48\\\": \\\"L\\\", \\\"49\\\": \\\"\\\\ud83d\\\\udc40\\\", \\\"50\\\": \\\"P\\\", \\\"51\\\": \\\"\\\\ud83c\\\\udf52\\\", \\\"52\\\": \\\"G\\\", \\\"53\\\": \\\"\\\\ud83d\\\\udcdd\\\", \\\"54\\\": \\\"U\\\", \\\"55\\\": \\\"z\\\", \\\"56\\\": \\\"j\\\", \\\"57\\\": \\\"!\\\", \\\"58\\\": \\\"4\\\", \\\"59\\\": \\\"/\\\", \\\"60\\\": \\\"q\\\", \\\"61\\\": \\\"9\\\", \\\"62\\\": \\\":\\\", \\\"63\\\": \\\"8\\\", \\\"64\\\": \\\"7\\\", \\\"65\\\": \\\"6\\\", \\\"66\\\": \\\"\\\\\\\"\\\", \\\"67\\\": \\\")\\\", \\\"68\\\": \\\"&\\\", \\\"69\\\": \\\"\\\\r\\\", \\\"70\\\": \\\"(\\\", \\\"71\\\": \\\"*\\\", \\\"72\\\": \\\"#\\\", \\\"73\\\": \\\";\\\", \\\"74\\\": \\\"?\\\", \\\"75\\\": \\\"\\\\u2019\\\", \\\"76\\\": \\\"%\\\", \\\"77\\\": \\\"=\\\", \\\"78\\\": \\\"@\\\", \\\"79\\\": \\\"+\\\", \\\"80\\\": \\\"~\\\", \\\"81\\\": \\\"_\\\", \\\"82\\\": \\\"\\\\\\\\\\\", \\\"83\\\": \\\"[\\\", \\\"84\\\": \\\"]\\\", \\\"85\\\": \\\"\\\\u201d\\\", \\\"86\\\": \\\"\\\\u201c\\\", \\\"87\\\": \\\"\\\\u2014\\\", \\\"88\\\": \\\"$\\\", \\\"89\\\": \\\"\\\\u00e9\\\", \\\"90\\\": \\\"\\\\u2013\\\", \\\"91\\\": \\\">\\\", \\\"92\\\": \\\"<\\\", \\\"93\\\": \\\"^\\\", \\\"94\\\": \\\"\\\\u00bd\\\", \\\"95\\\": \\\"`\\\", \\\"96\\\": \\\"\\\\u2026\\\", \\\"97\\\": \\\"}\\\", \\\"98\\\": \\\"{\\\", \\\"99\\\": \\\"\\\\u00ae\\\", \\\"100\\\": \\\"\\\\u2018\\\", \\\"101\\\": \\\"\\\\u00b0\\\", \\\"102\\\": \\\"|\\\", \\\"103\\\": \\\"\\\\u00e8\\\", \\\"104\\\": \\\"\\\\u00a0\\\", \\\"105\\\": \\\"\\\\u00f1\\\", \\\"106\\\": \\\"\\\\u00a9\\\", \\\"107\\\": \\\"\\\\u00e4\\\", \\\"108\\\": \\\"\\\\u00ef\\\", \\\"109\\\": \\\"\\\\u00bf\\\", \\\"110\\\": \\\"\\\\u00fc\\\", \\\"111\\\": \\\"\\\\u00bc\\\", \\\"112\\\": \\\"\\\\u00ee\\\", \\\"113\\\": \\\"\\\\u2122\\\", \\\"114\\\": \\\"\\\\u00fa\\\", \\\"115\\\": \\\"\\\\u00b4\\\", \\\"116\\\": \\\"\\\\u00e7\\\", \\\"117\\\": 
\\\"\\\\u00fb\\\", \\\"118\\\": \\\"\\\\u00f3\\\", \\\"119\\\": \\\"\\\\u00f6\\\", \\\"120\\\": \\\"\\\\u00f4\\\", \\\"121\\\": \\\"\\\\u00be\\\", \\\"122\\\": \\\"\\\\u00ed\\\", \\\"123\\\": \\\"\\\\u00ba\\\", \\\"124\\\": \\\"\\\\u00e2\\\", \\\"125\\\": \\\"\\\\u00ea\\\", \\\"126\\\": \\\"\\\\t\\\", \\\"127\\\": \\\"\\\\u00eb\\\", \\\"128\\\": \\\"\\\\u00a2\\\", \\\"129\\\": \\\"\\\\u00f9\\\", \\\"130\\\": \\\"\\\\u00e0\\\"}\", \"word_index\": \"{\\\" \\\": 1, \\\"e\\\": 2, \\\"a\\\": 3, \\\"t\\\": 4, \\\"o\\\": 5, \\\"i\\\": 6, \\\"n\\\": 7, \\\"r\\\": 8, \\\"s\\\": 9, \\\"l\\\": 10, \\\"\\\\n\\\": 11, \\\"d\\\": 12, \\\"u\\\": 13, \\\"c\\\": 14, \\\"h\\\": 15, \\\"g\\\": 16, \\\"m\\\": 17, \\\"p\\\": 18, \\\"b\\\": 19, \\\"f\\\": 20, \\\"w\\\": 21, \\\"k\\\": 22, \\\"\\\\u25aa\\\": 23, \\\"\\\\ufe0e\\\": 24, \\\"\\\\u2022\\\": 25, \\\"y\\\": 26, \\\"I\\\": 27, \\\",\\\": 28, \\\"v\\\": 29, \\\"T\\\": 30, \\\"N\\\": 31, \\\"E\\\": 32, \\\"S\\\": 33, \\\"x\\\": 34, \\\"R\\\": 35, \\\".\\\": 36, \\\"1\\\": 37, \\\"-\\\": 38, \\\"D\\\": 39, \\\"C\\\": 40, \\\"O\\\": 41, \\\"0\\\": 42, \\\"5\\\": 43, \\\"3\\\": 44, \\\"2\\\": 45, \\\"'\\\": 46, \\\"\\\\ud83d\\\\udccc\\\": 47, \\\"L\\\": 48, \\\"\\\\ud83d\\\\udc40\\\": 49, \\\"P\\\": 50, \\\"\\\\ud83c\\\\udf52\\\": 51, \\\"G\\\": 52, \\\"\\\\ud83d\\\\udcdd\\\": 53, \\\"U\\\": 54, \\\"z\\\": 55, \\\"j\\\": 56, \\\"!\\\": 57, \\\"4\\\": 58, \\\"/\\\": 59, \\\"q\\\": 60, \\\"9\\\": 61, \\\":\\\": 62, \\\"8\\\": 63, \\\"7\\\": 64, \\\"6\\\": 65, \\\"\\\\\\\"\\\": 66, \\\")\\\": 67, \\\"&\\\": 68, \\\"\\\\r\\\": 69, \\\"(\\\": 70, \\\"*\\\": 71, \\\"#\\\": 72, \\\";\\\": 73, \\\"?\\\": 74, \\\"\\\\u2019\\\": 75, \\\"%\\\": 76, \\\"=\\\": 77, \\\"@\\\": 78, \\\"+\\\": 79, \\\"~\\\": 80, \\\"_\\\": 81, \\\"\\\\\\\\\\\": 82, \\\"[\\\": 83, \\\"]\\\": 84, \\\"\\\\u201d\\\": 85, \\\"\\\\u201c\\\": 86, \\\"\\\\u2014\\\": 87, \\\"$\\\": 88, \\\"\\\\u00e9\\\": 89, \\\"\\\\u2013\\\": 90, \\\">\\\": 91, \\\"<\\\": 92, \\\"^\\\": 
import pickle

# Persist the fitted tokenizer next to the notebook ...
with open('tokenizer.pickle', 'wb') as pickle_out:
    pickle.dump(tokenizer, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)

# ... and read it straight back, so the rest of the notebook works with exactly
# the object that was written to disk.
with open('tokenizer.pickle', 'rb') as pickle_in:
    tokenizer = pickle.load(pickle_in)

# One slot per known character plus one extra: the ids in word_index start at 1,
# so id 0 needs a slot as well.
VOCABULARY_SIZE = 1 + len(tokenizer.word_counts)

print('VOCABULARY_SIZE: ', VOCABULARY_SIZE)

# Encode every recipe as a list of integer character ids.
dataset_vectorized = tokenizer.texts_to_sequences(dataset_filtered)

print('Vectorized dataset size', len(dataset_vectorized))

# Peek at the first ten ids of the first recipe.
print(dataset_vectorized[0][:10], '...')


def recipe_sequence_to_string(recipe_sequence):
    """Decode one sequence of token ids with the global tokenizer and print it.

    Despite the name, nothing is returned: the decoded text is only printed.
    In the recorded output the decoded characters come out separated by spaces.

    Args:
        recipe_sequence: a single sequence of integer token ids.
    """
    decoded_texts = tokenizer.sequences_to_texts([recipe_sequence])
    print(decoded_texts[0])


recipe_sequence_to_string(dataset_vectorized[0])

# Lengths (in characters) of the first ten recipes, numbered from 1.
for recipe_number, recipe in enumerate(dataset_vectorized[:10], start=1):
    print('Recipe #{} length: {}'.format(recipe_number, len(recipe)))
# texts_to_sequences returns one list of ids per input text, while pad_sequences
# expects a single padding value. Unwrap the only id of the only text so that a
# scalar is passed (the one-element list used before only worked because NumPy
# broadcast it).
stop_sign_id = tokenizer.texts_to_sequences([STOP_SIGN])[0][0]

dataset_vectorized_padded_without_stops = tf.keras.preprocessing.sequence.pad_sequences(
    dataset_vectorized,
    padding='post',
    truncating='post',
    # We use -1 here and +1 in the next step to make sure
    # that all recipes will have at least 1 stop sign at the end,
    # since each sequence will be shifted and truncated afterwards
    # (to generate X and Y sequences).
    maxlen=MAX_RECIPE_LENGTH-1,
    value=stop_sign_id
)

# Second pass: every row is now exactly MAX_RECIPE_LENGTH-1 long, so extending to
# MAX_RECIPE_LENGTH+1 appends stop signs to every recipe, including truncated ones.
dataset_vectorized_padded = tf.keras.preprocessing.sequence.pad_sequences(
    dataset_vectorized_padded_without_stops,
    padding='post',
    truncating='post',
    maxlen=MAX_RECIPE_LENGTH+1,
    value=stop_sign_id
)

# Every row must have the same fixed length after padding.
assert dataset_vectorized_padded.shape[1] == MAX_RECIPE_LENGTH + 1

# Numbered from 1, consistent with the length preview of the unpadded recipes.
for recipe_number, recipe in enumerate(dataset_vectorized_padded[:10], start=1):
    print('Recipe #{} length: {}'.format(recipe_number, len(recipe)))
# Show the first padded recipe decoded back to text.
recipe_sequence_to_string(dataset_vectorized_padded[0])

# One dataset element per padded recipe row.
dataset = tf.data.Dataset.from_tensor_slices(dataset_vectorized_padded)

print(dataset)

# Inspect the first element both as raw ids and as decoded text.
for recipe in dataset.take(1):
    recipe_ids = recipe.numpy()
    print('Raw recipe:\n', recipe_ids, '\n\n\n')
    print('Stringified recipe:\n')
    recipe_sequence_to_string(recipe_ids)


# ### Split examples

def split_input_target(recipe):
    """Split one sequence into a training pair shifted by one position.

    Args:
        recipe: a sliceable sequence of token ids.

    Returns:
        A tuple (input, target): the input is the sequence without its last
        element, the target is the sequence without its first element, so
        target[i] is the element that follows input[i].
    """
    return recipe[:-1], recipe[1:]


dataset_targeted = dataset.map(split_input_target)

print(dataset_targeted)

# Preview the first (input, target) pair: sizes and the first 50 decoded characters.
for input_example, target_example in dataset_targeted.take(1):
    input_ids = input_example.numpy()
    target_ids = target_example.numpy()

    print('Input sequence size:', len(input_ids))
    print('Target sequence size:', len(target_ids))
    print()

    # sequences_to_texts returns one decoded string per sequence passed in.
    input_stringified, target_stringified = tokenizer.sequences_to_texts(
        [input_ids[:50], target_ids[:50]]
    )

    print('Input: ', repr(input_stringified))
    print('Target: ', repr(target_stringified))

# Summary of the dataset dimensions used by the model below.
TOTAL_RECIPES_NUM = len(dataset_filtered)
print('TOTAL_RECIPES_NUM: ', TOTAL_RECIPES_NUM)
print('MAX_RECIPE_LENGTH: ', MAX_RECIPE_LENGTH)
print('VOCABULARY_SIZE: ', VOCABULARY_SIZE)

print(dataset_targeted)

# Batch size
BATCH_SIZE = 64
SHUFFLE_BUFFER_SIZE = 1000

# Shuffle, group into fixed-size batches (the incomplete last batch is dropped)
# and repeat indefinitely.
dataset_train = (
    dataset_targeted
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .repeat()
)

print(dataset_train)
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the Embedding -> LSTM -> Dense network used in this notebook.

    Args:
        vocab_size: Embedding input dimension and number of units in the
            final Dense layer.
        embedding_dim: Width of each embedding vector.
        rnn_units: Number of units in the LSTM layer.
        batch_size: Fixed batch dimension baked into the input shape; the
            sequence dimension is left as ``None``.

    Returns:
        An uncompiled ``tf.keras`` Sequential model. The Dense layer has no
        activation, so the model outputs raw scores per vocabulary entry.
    """
    embedding = tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        batch_input_shape=[batch_size, None],
    )

    # stateful=True keeps the recurrent state between calls, which is why the
    # batch size has to be fixed in the input shape above.
    recurrent = tf.keras.layers.LSTM(
        units=rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer=tf.keras.initializers.GlorotNormal(),
    )

    projection = tf.keras.layers.Dense(vocab_size)

    return tf.keras.models.Sequential([embedding, recurrent, projection])
"a59afcdc-c3af-4a6d-b6af-198bbca5ce7b" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Prediction for the 1st letter of the batch 1st sequense:\n", - "tf.Tensor(\n", - "[-1.0675509e-03 2.4121299e-03 -3.4268999e-03 -9.4102332e-03\n", - " 3.8642397e-03 2.8061401e-03 7.9052253e-03 -5.2992580e-03\n", - " -5.5068033e-04 -1.3768519e-03 2.5998771e-03 -1.6544745e-03\n", - " 9.5341755e-03 -6.5846457e-03 5.3987121e-03 1.1021262e-03\n", - " 5.5060796e-03 -1.6169570e-03 -1.0811265e-03 -1.9410717e-03\n", - " 1.8350012e-03 -6.6040002e-04 2.4442264e-04 5.1552351e-03\n", - " -4.3791373e-05 -2.7348739e-03 3.8417561e-03 2.1773963e-03\n", - " -2.1101839e-03 1.9899786e-03 -8.0218614e-04 4.5034452e-03\n", - " -6.6260784e-04 2.6028315e-03 6.5406715e-04 -1.8726061e-03\n", - " 1.0012863e-03 -3.2721364e-03 7.5873071e-03 -2.9868486e-03\n", - " -1.0677545e-04 -1.9733279e-03 3.6382321e-03 8.2459439e-05\n", - " -5.2328208e-03 -1.7041726e-03 2.1820085e-03 -5.8736606e-03\n", - " -1.1217683e-03 3.1163255e-03 3.8048404e-03 3.6802294e-03\n", - " 2.2023141e-03 -5.0572255e-03 3.6094848e-03 1.4169712e-03\n", - " 2.6842880e-03 5.5694389e-03 4.6234187e-03 1.6224679e-03\n", - " 4.0268209e-03 -7.0692124e-03 -2.0585626e-03 -3.0523897e-03\n", - " 2.0108016e-03 -2.7215972e-03 1.4773710e-03 3.2362491e-03\n", - " 4.5230016e-03 2.7109061e-03 -5.5058050e-04 5.9013404e-03\n", - " -2.9318572e-03 -4.7815731e-03 -4.3841619e-03 -2.8076277e-03\n", - " -4.4378904e-03 5.6000329e-03 -7.2943093e-04 -2.8678033e-04\n", - " 3.7040499e-03 -3.6899294e-03 8.2457131e-03 -1.4448026e-03\n", - " -5.2399593e-03 8.9364068e-04 -4.7255959e-03 -9.9287822e-04\n", - " -5.8756676e-04 -3.6809328e-03 8.9325383e-04 -2.2902261e-03\n", - " 3.8798158e-03 -1.0823107e-03 2.2230127e-03 -4.9522863e-04\n", - " 9.3542715e-04 1.1118610e-03 1.9911951e-03 3.3677253e-03\n", - " -3.7499142e-03 9.7308145e-04 -5.8440920e-03 3.5633869e-03\n", - " 4.9105873e-03 5.1725475e-04 2.1967487e-03 -6.0908021e-03\n", - " 
7.0009519e-05 4.0739900e-03 -9.2197192e-04 -2.9479489e-03\n", - " 8.1171719e-03 -4.1095824e-03 -4.0333322e-03 -3.7203771e-03\n", - " -1.9784693e-03 1.2859865e-03 -1.9625863e-03 -3.6083779e-03\n", - " 7.2070572e-04 2.2963071e-03 -6.0072662e-03 -1.9689004e-03\n", - " -9.8250352e-04 1.6135543e-03 2.9561080e-03 -3.2419180e-03\n", - " -4.1539696e-04 1.0899190e-03 -3.4475133e-03], shape=(131,), dtype=float32)\n" - ] - } - ], - "source": [ - "print('Prediction for the 1st letter of the batch 1st sequense:')\n", - "print(example_batch_predictions[0, 0])" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ddczg32djMbI", - "outputId": "bdd397d7-8527-4317-ccb7-ab5fe88aae4b" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "(2000,)" - ] - }, - "metadata": {}, - "execution_count": 65 - } - ], - "source": [ - "sampled_indices = tf.random.categorical(\n", - " logits=example_batch_predictions[0],\n", - " num_samples=1\n", - ")\n", - "\n", - "sampled_indices = tf.squeeze(\n", - " input=sampled_indices,\n", - " axis=-1\n", - ").numpy()\n", - "\n", - "sampled_indices.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "AE7k43_pjWFd", - "outputId": "9cf860e5-65ea-468e-ed6a-b681cef85975" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Input:\n", - " '๐Ÿ“Œ T I T L E \\n \\n a m i s h b u t t e r m i l k c h e e s e c a k e \\n \\n ๐Ÿ‘€ D E S C R I P T I O'\n", - "\n", - "Next char prediction:\n", - " \"S h d ] รญ รด * 8 P รบ \\t l 8 ( รผ + โ€ ยฎ ` รข ' _ ยพ ๐Ÿ‘€ , 8 รง : รน p ' b ๐Ÿ‘€ i รง 3 รค g ( N m C n u ) ] รช ยฝ R รง\"\n" - ] - } - ], - "source": [ - "print('Input:\\n', repr(''.join(tokenizer.sequences_to_texts([input_example_batch[0].numpy()[:50]]))))\n", - "print()\n", - "print('Next char prediction:\\n', 
def loss(labels, logits):
    """Sparse categorical cross-entropy evaluated on raw logits.

    Args:
        labels: Integer ids of the true next tokens.
        logits: Unnormalised model scores; ``from_logits=True`` means no
            softmax is expected to have been applied beforehand.

    Returns:
        The cross-entropy tensor as returned by Keras. It is not reduced to a
        scalar here (the notebook prints shape (64, 2000) for one batch).
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        labels,
        logits,
        from_logits=True,
    )
[ - "adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.002)\n", - "\n", - "model.compile(\n", - " optimizer=adam_optimizer,\n", - " loss=loss\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": { - "id": "uWc14jkSj6Gx" - }, - "outputs": [], - "source": [ - "early_stopping_callback = tf.keras.callbacks.EarlyStopping(\n", - " patience=5,\n", - " monitor='loss',\n", - " restore_best_weights=True,\n", - " verbose=1\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "BlDCt5pMl7T_", - "outputId": "ff3606df-c49e-41b7-a238-2d89b9e4c8dd" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "EPOCHS: 200\n", - "INITIAL_EPOCH: 1\n", - "STEPS_PER_EPOCH: 25\n" - ] - } - ], - "source": [ - "EPOCHS = 200\n", - "INITIAL_EPOCH = 1\n", - "STEPS_PER_EPOCH = 25\n", - "\n", - "print('EPOCHS: ', EPOCHS)\n", - "print('INITIAL_EPOCH: ', INITIAL_EPOCH)\n", - "print('STEPS_PER_EPOCH: ', STEPS_PER_EPOCH)" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "jbkRVdETmeOp", - "outputId": "cb479ab5-63ca-4dc3-e683-f66449e8ccdf" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Epoch 2/200\n", - "25/25 [==============================] - 26s 952ms/step - loss: 2.6953\n", - "Epoch 3/200\n", - "25/25 [==============================] - 23s 940ms/step - loss: 1.6520\n", - "Epoch 4/200\n", - "25/25 [==============================] - 23s 940ms/step - loss: 1.4917\n", - "Epoch 5/200\n", - "25/25 [==============================] - 23s 939ms/step - loss: 1.3476\n", - "Epoch 6/200\n", - "25/25 [==============================] - 24s 941ms/step - loss: 1.1928\n", - "Epoch 7/200\n", - "25/25 [==============================] - 23s 940ms/step - loss: 1.0483\n", - "Epoch 8/200\n", - "25/25 
[==============================] - 23s 940ms/step - loss: 1.0444\n", - "Epoch 9/200\n", - "25/25 [==============================] - 23s 940ms/step - loss: 0.9623\n", - "Epoch 10/200\n", - "25/25 [==============================] - 23s 940ms/step - loss: 0.9288\n", - "Epoch 11/200\n", - "25/25 [==============================] - 23s 940ms/step - loss: 0.8519\n", - "Epoch 12/200\n", - "25/25 [==============================] - 23s 940ms/step - loss: 0.7795\n", - "Epoch 13/200\n", - "25/25 [==============================] - 23s 940ms/step - loss: 0.7281\n", - "Epoch 14/200\n", - "25/25 [==============================] - 23s 940ms/step - loss: 0.6974\n", - "Epoch 15/200\n", - "25/25 [==============================] - 23s 940ms/step - loss: 0.6793\n", - "Epoch 16/200\n", - "25/25 [==============================] - 23s 940ms/step - loss: 0.6458\n", - "Epoch 17/200\n", - "25/25 [==============================] - 23s 938ms/step - loss: 0.6060\n", - "Epoch 18/200\n", - "25/25 [==============================] - 23s 940ms/step - loss: 0.5663\n", - "Epoch 19/200\n", - "25/25 [==============================] - 23s 938ms/step - loss: 0.5531\n", - "Epoch 20/200\n", - "25/25 [==============================] - 23s 939ms/step - loss: 0.5259\n", - "Epoch 21/200\n", - "25/25 [==============================] - 23s 939ms/step - loss: 0.5237\n", - "Epoch 22/200\n", - "25/25 [==============================] - 23s 939ms/step - loss: 0.5020\n", - "Epoch 23/200\n", - "25/25 [==============================] - 23s 939ms/step - loss: 0.4907\n", - "Epoch 24/200\n", - "25/25 [==============================] - 23s 937ms/step - loss: 0.4998\n", - "Epoch 25/200\n", - "25/25 [==============================] - 23s 938ms/step - loss: 0.4917\n", - "Epoch 26/200\n", - "25/25 [==============================] - 23s 938ms/step - loss: 0.4870\n", - "Epoch 27/200\n", - "25/25 [==============================] - 23s 938ms/step - loss: 0.4671\n", - "Epoch 28/200\n", - "25/25 [==============================] - 23s 
939ms/step - loss: 0.4494\n", - "Epoch 29/200\n", - "25/25 [==============================] - 23s 939ms/step - loss: 0.4446\n", - "Epoch 30/200\n", - "25/25 [==============================] - 23s 939ms/step - loss: 0.4414\n", - "Epoch 31/200\n", - "25/25 [==============================] - 23s 938ms/step - loss: 0.4500\n", - "Epoch 32/200\n", - "25/25 [==============================] - 23s 938ms/step - loss: 0.4500\n", - "Epoch 33/200\n", - "25/25 [==============================] - 23s 936ms/step - loss: 0.4420\n", - "Epoch 34/200\n", - "25/25 [==============================] - 23s 939ms/step - loss: 0.4131\n", - "Epoch 35/200\n", - "25/25 [==============================] - 23s 939ms/step - loss: 0.4113\n", - "Epoch 36/200\n", - "25/25 [==============================] - 23s 939ms/step - loss: 0.4117\n", - "Epoch 37/200\n", - "25/25 [==============================] - 23s 938ms/step - loss: 0.4096\n", - "Epoch 38/200\n", - "25/25 [==============================] - 23s 939ms/step - loss: 0.3968\n", - "Epoch 39/200\n", - "25/25 [==============================] - 23s 938ms/step - loss: 0.4059\n", - "Epoch 40/200\n", - "25/25 [==============================] - 23s 937ms/step - loss: 0.4061\n", - "Epoch 41/200\n", - "25/25 [==============================] - 23s 939ms/step - loss: 0.4139\n", - "Epoch 42/200\n", - "25/25 [==============================] - 23s 937ms/step - loss: 0.4168\n", - "Epoch 43/200\n", - "25/25 [==============================] - ETA: 0s - loss: 0.4013Restoring model weights from the end of the best epoch: 38.\n", - "25/25 [==============================] - 23s 937ms/step - loss: 0.4013\n", - "Epoch 00043: early stopping\n" - ] - } - ], - "source": [ - "history = model.fit(\n", - " x=dataset_train,\n", - " epochs=EPOCHS,\n", - " steps_per_epoch=STEPS_PER_EPOCH,\n", - " initial_epoch=INITIAL_EPOCH,\n", - " callbacks=[\n", - " checkpoint_callback,\n", - " early_stopping_callback\n", - " ]\n", - ")\n", - "\n", - "# Saving the trained model to file (to be 
able to re-use it later).\n", - "model_name = 'recipe_generation_rnn_2.h5'\n", - "model.save(model_name, save_format='h5')" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 295 - }, - "id": "NjGhRmT0rzVj", - "outputId": "33470d00-c502-4cc6-9f2c-3ebed0a1d12d" - }, - "outputs": [ - { - "output_type": "display_data", - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - } - } - ], - "source": [ - "def render_training_history(training_history):\n", - " loss = training_history.history['loss']\n", - "\n", - " plt.title('Loss')\n", - " plt.xlabel('Epoch')\n", - " plt.ylabel('Loss')\n", - " plt.plot(loss, label='Training set')\n", - " plt.legend()\n", - " plt.grid(linestyle='--', linewidth=1, alpha=0.5)\n", - " plt.show()\n", - "\n", - "render_training_history(history)" - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 37 - }, - "id": "QulzmOYNsvih", - "outputId": "bb191383-ac48-4852-f4dc-f69837a8736c" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - }, - "text/plain": [ - "'tmp/checkpoints/ckpt_43'" - ] - }, - "metadata": {}, - "execution_count": 72 - } - ], - "source": [ - "tf.train.latest_checkpoint(checkpoint_dir)" - ] - }, - { - "cell_type": "markdown", - "source": [ - "### Use simplified model with saved weights \n" - ], - "metadata": { - "id": "YsBulmpXx3IN" - } - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "OIcrvYAKrtwm", - "outputId": "7806a3c9-90e7-448f-bfd9-bb027f14c39d" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Model: \"sequential_1\"\n", - "_________________________________________________________________\n", - " Layer (type) Output Shape Param # \n", - "=================================================================\n", - " embedding_1 (Embedding) (1, None, 256) 33536 \n", - " \n", - " lstm_1 (LSTM) (1, None, 1024) 5246976 \n", - " \n", - " dense_1 (Dense) (1, None, 131) 134275 \n", - " \n", - "=================================================================\n", - "Total params: 5,414,787\n", - "Trainable params: 5,414,787\n", - 
"Non-trainable params: 0\n", - "_________________________________________________________________\n" - ] - } - ], - "source": [ - "simplified_batch_size = 1\n", - "vocab_size=VOCABULARY_SIZE\n", - "embedding_dim=256\n", - "rnn_units=1024\n", - "\n", - "model_simplified = build_model(vocab_size, embedding_dim, rnn_units, simplified_batch_size)\n", - "model_simplified.load_weights('/content/drive/MyDrive/data/ckpt')\n", - "model_simplified.build(tf.TensorShape([simplified_batch_size, None]))\n", - "\n", - "model_simplified.summary()" - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "metadata": { - "id": "QPk5iRQ0m52x" - }, - "outputs": [], - "source": [ - "def generate_text(start_string, num_generate, temperature, model = model_simplified):\n", - " # Evaluation step (generating text using the learned model)\n", - " \n", - " padded_start_string = STOP_WORD_TITLE + start_string\n", - "\n", - " # Converting our start string to numbers (vectorizing).\n", - " input_indices = np.array(tokenizer.texts_to_sequences([padded_start_string]))\n", - "\n", - " # Empty string to store our results.\n", - " text_generated = []\n", - "\n", - " # Here batch size == 1.\n", - " model.reset_states()\n", - " for char_index in range(num_generate):\n", - " predictions = model(input_indices)\n", - " # remove the batch dimension\n", - " predictions = tf.squeeze(predictions, 0)\n", - "\n", - " # Using a categorical distribution to predict the character returned by the model.\n", - " predictions = predictions / temperature\n", - " predicted_id = tf.random.categorical(\n", - " predictions,\n", - " num_samples=1\n", - " )[-1, 0].numpy()\n", - "\n", - " # We pass the predicted character as the next input to the model\n", - " # along with the previous hidden state.\n", - " input_indices = tf.expand_dims([predicted_id], 0)\n", - " \n", - " next_character = tokenizer.sequences_to_texts(input_indices.numpy())[0]\n", - "\n", - " text_generated.append(next_character)\n", - "\n", - " return 
(padded_start_string + ''.join(text_generated))" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "fZfhWVe6qirY", - "outputId": "3cd2024d-e986-4044-a6cb-72f94d5781a5" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "๐Ÿ“Œ TITLE\n", - "\n", - "chocolate chips\n", - "\n", - "๐Ÿ“ INSTRUCTIONS\n", - "\n", - "โ–ช๏ธŽ preheat oven to 350f\n", - "โ–ช๏ธŽ grease and flour a 9x5x3 inch pan\n", - "โ–ช๏ธŽ bake for 20 to 35 minutes or until toothpick inserted in center comes out clean\n", - "โ–ช๏ธŽ cool completely on a wire rack\n", - "โ–ช๏ธŽ cool completely on a wire rack\n", - "โ–ช๏ธŽ beat together the flour and baking soda\n", - "โ–ช๏ธŽ add the egg , and vanilla\n", - "โ–ช๏ธŽ beat until smooth\n", - "โ–ช๏ธŽ stir in flour mixture , beating well after each addition\n", - "โ–ช๏ธŽ stir in the coconut\n", - "โ–ช๏ธŽ pour into a greased and floured 9x5x2 pan\n", - "โ–ช๏ธŽ place one side of the pan , melt butter , sugar , butter , cream cheese , and vanilla\n", - "โ–ช๏ธŽ stir in the flour mixture and mix well\n", - "โ–ช๏ธŽ spread the cheesecake mixture over the two pans\n", - "โ–ช๏ธŽ bake for 15 minutes or until the top of the cake is cooled\n", - "โ–ช๏ธŽ cool in pan for 10 minutes\n", - "โ–ช๏ธŽ remove from the oven and cool completely on a wire rack\n", - "โ–ช๏ธŽ remove from pans and the cake batter in the center of the pan\n", - "โ–ช๏ธŽ bake for 12-12 minutes or until toothpick inserted in the center comes out clean\n", - "โ–ช๏ธŽ cool on a wire rack\n", - "โ–ช๏ธŽ cool on wire rack\n", - "โ–ช๏ธŽ cool on a wire\n" - ] - } - ], - "source": [ - "print(generate_text(start_string = 'chocolate', temperature= 0.4, num_generate= 1000))" - ] - }, - { - "cell_type": "markdown", - "source": [ - "### Create Demo" - ], - "metadata": { - "id": "a5SDwpIlyWrA" - } - }, - { - "cell_type": "code", - "source": [ - "!pip install gradio" - ], - "metadata": { - "colab": { - 
"base_uri": "https://localhost:8080/" - }, - "id": "xt5O806iyhHS", - "outputId": "1f659870-adf4-47f7-e89c-0a827c048845" - }, - "execution_count": 75, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting gradio\n", - " Downloading gradio-2.7.0-py3-none-any.whl (865 kB)\n", - "\u001b[K |โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 865 kB 5.1 MB/s \n", - "\u001b[?25hCollecting analytics-python\n", - " Downloading analytics_python-1.4.0-py2.py3-none-any.whl (15 kB)\n", - "Collecting Flask-Cors>=3.0.8\n", - " Downloading Flask_Cors-3.0.10-py2.py3-none-any.whl (14 kB)\n", - "Collecting Flask-Login\n", - " Downloading Flask_Login-0.5.0-py2.py3-none-any.whl (16 kB)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from gradio) (1.19.5)\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from gradio) (1.1.5)\n", - "Collecting markdown2\n", - " Downloading markdown2-2.4.2-py2.py3-none-any.whl (34 kB)\n", - "Collecting paramiko\n", - " Downloading paramiko-2.9.2-py2.py3-none-any.whl (210 kB)\n", - "\u001b[K |โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 210 kB 72.4 MB/s \n", - "\u001b[?25hCollecting flask-cachebuster\n", - " Downloading Flask-CacheBuster-1.0.0.tar.gz (3.1 kB)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from gradio) (2.23.0)\n", - "Requirement already satisfied: Flask>=1.1.1 in /usr/local/lib/python3.7/dist-packages (from gradio) (1.1.4)\n", - "Requirement already satisfied: pillow in /usr/local/lib/python3.7/dist-packages (from gradio) (7.1.2)\n", - "Requirement already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from gradio) (3.2.2)\n", - "Collecting pydub\n", - " Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n", - "Collecting pycryptodome\n", - " Downloading 
pycryptodome-3.12.0-cp35-abi3-manylinux2010_x86_64.whl (2.0 MB)\n", - "\u001b[K |โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 2.0 MB 56.1 MB/s \n", - "\u001b[?25hCollecting ffmpy\n", - " Downloading ffmpy-0.3.0.tar.gz (4.8 kB)\n", - "Requirement already satisfied: itsdangerous<2.0,>=0.24 in /usr/local/lib/python3.7/dist-packages (from Flask>=1.1.1->gradio) (1.1.0)\n", - "Requirement already satisfied: Werkzeug<2.0,>=0.15 in /usr/local/lib/python3.7/dist-packages (from Flask>=1.1.1->gradio) (1.0.1)\n", - "Requirement already satisfied: Jinja2<3.0,>=2.10.1 in /usr/local/lib/python3.7/dist-packages (from Flask>=1.1.1->gradio) (2.11.3)\n", - "Requirement already satisfied: click<8.0,>=5.1 in /usr/local/lib/python3.7/dist-packages (from Flask>=1.1.1->gradio) (7.1.2)\n", - "Requirement already satisfied: Six in /usr/local/lib/python3.7/dist-packages (from Flask-Cors>=3.0.8->gradio) (1.15.0)\n", - "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2<3.0,>=2.10.1->Flask>=1.1.1->gradio) (2.0.1)\n", - "Collecting backoff==1.10.0\n", - " Downloading backoff-1.10.0-py2.py3-none-any.whl (31 kB)\n", - "Requirement already satisfied: python-dateutil>2.1 in /usr/local/lib/python3.7/dist-packages (from analytics-python->gradio) (2.8.2)\n", - "Collecting monotonic>=1.5\n", - " Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->gradio) (2021.10.8)\n", - "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->gradio) (1.24.3)\n", - "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->gradio) (3.0.4)\n", - "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->gradio) (2.10)\n", - 
"Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->gradio) (3.0.6)\n", - "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->gradio) (1.3.2)\n", - "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib->gradio) (0.11.0)\n", - "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas->gradio) (2018.9)\n", - "Collecting pynacl>=1.0.1\n", - " Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)\n", - "\u001b[K |โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 856 kB 62.8 MB/s \n", - "\u001b[?25hCollecting cryptography>=2.5\n", - " Downloading cryptography-36.0.1-cp36-abi3-manylinux_2_24_x86_64.whl (3.6 MB)\n", - "\u001b[K |โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 3.6 MB 54.3 MB/s \n", - "\u001b[?25hCollecting bcrypt>=3.1.3\n", - " Downloading bcrypt-3.2.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (61 kB)\n", - "\u001b[K |โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 61 kB 475 kB/s \n", - "\u001b[?25hRequirement already satisfied: cffi>=1.1 in /usr/local/lib/python3.7/dist-packages (from bcrypt>=3.1.3->paramiko->gradio) (1.15.0)\n", - "Requirement already satisfied: pycparser in /usr/local/lib/python3.7/dist-packages (from cffi>=1.1->bcrypt>=3.1.3->paramiko->gradio) (2.21)\n", - "Building wheels for collected packages: ffmpy, flask-cachebuster\n", - " Building wheel for ffmpy (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - " Created wheel for ffmpy: filename=ffmpy-0.3.0-py3-none-any.whl size=4710 sha256=caed95e7ed9255709f5e2072514271aec09a19050876e761d4e50974d4e0feac\n", - " Stored in directory: /root/.cache/pip/wheels/13/e4/6c/e8059816e86796a597c6e6b0d4c880630f51a1fcfa0befd5e6\n", - " Building wheel for flask-cachebuster (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for flask-cachebuster: filename=Flask_CacheBuster-1.0.0-py3-none-any.whl size=3371 sha256=eeaff1e0d9760425d30c7fdf969de62af0790435797b9ba60e86a52871d0a2a6\n", - " Stored in directory: /root/.cache/pip/wheels/28/c0/c4/44687421dab41455be93112bd1b0dee1f3c5a9aa27bee63708\n", - "Successfully built ffmpy flask-cachebuster\n", - "Installing collected packages: pynacl, monotonic, cryptography, bcrypt, backoff, pydub, pycryptodome, paramiko, markdown2, Flask-Login, Flask-Cors, flask-cachebuster, ffmpy, analytics-python, gradio\n", - "Successfully installed Flask-Cors-3.0.10 Flask-Login-0.5.0 analytics-python-1.4.0 backoff-1.10.0 bcrypt-3.2.0 cryptography-36.0.1 ffmpy-0.3.0 flask-cachebuster-1.0.0 gradio-2.7.0 markdown2-2.4.2 monotonic-1.6 paramiko-2.9.2 pycryptodome-3.12.0 pydub-0.25.1 pynacl-1.5.0\n" - ] - } - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": { - "id": "32BbT3Pcq6FD", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 640 - }, - "outputId": "deb157ad-09f2-4410-9e18-b8657f1f81c0" - }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`\n", - "Running on public URL: https://53578.gradio.app\n", - "\n", - "This share link expires in 72 hours. 
import gradio as gr


# Input order must match generate_text's positional parameters:
# (start_string, num_generate, temperature).
iface = gr.Interface(
    fn=generate_text,
    inputs=[
        "text",
        # Number of tokens to generate: whole steps, and never 0 (which would
        # return only the seed text).
        gr.inputs.Slider(minimum=1, maximum=1000, step=1, default=500),
        # Temperature divides the logits, so it must stay strictly positive;
        # 0.4 is the value used in the example call earlier in the notebook.
        gr.inputs.Slider(minimum=0.1, maximum=1.0, step=0.1, default=0.4),
    ],
    outputs=["text"])
iface.launch()