{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "**Chapter 13 – Loading and Preprocessing Data with TensorFlow**\n", "\n", "_This notebook contains all the sample code and solutions to the exercises in chapter 13._" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", " \n", "
\n", " Run in Google Colab\n", "
" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Setup" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First, let's import a few common modules, ensure Matplotlib plots figures inline and prepare a function to save the figures. We also check that Python 3.5 or later is installed (although Python 2.x may work, it is deprecated so we strongly recommend you use Python 3 instead), as well as Scikit-Learn ≥0.20 and TensorFlow ≥2.0." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [
"# Python ≥3.5 is required\n",
"import sys\n",
"assert sys.version_info >= (3, 5)\n",
"\n",
"# Version strings must be compared numerically: as plain strings,\n",
"# \"0.9\" >= \"0.20\" is True and \"10.0\" >= \"2.0\" is False.\n",
"import re\n",
"\n",
"def version_tuple(version_string):\n",
"    \"\"\"Return the leading dotted-integer part of a version string as a tuple of ints.\"\"\"\n",
"    return tuple(int(part) for part in re.match(r\"\\d+(?:\\.\\d+)*\", version_string).group().split(\".\"))\n",
"\n",
"# Scikit-Learn ≥0.20 is required\n",
"import sklearn\n",
"assert version_tuple(sklearn.__version__) >= (0, 20)\n",
"\n",
"try:\n",
"    # %tensorflow_version only exists in Colab.\n",
"    %tensorflow_version 2.x\n",
"    !pip install -q -U tfx==0.15.0rc0\n",
"    print(\"You can safely ignore the package incompatibility errors.\")\n",
"except Exception:\n",
"    pass\n",
"\n",
"# TensorFlow ≥2.0 is required\n",
"import tensorflow as tf\n",
"from tensorflow import keras\n",
"assert version_tuple(tf.__version__) >= (2, 0)\n",
"\n",
"# Common imports\n",
"import numpy as np\n",
"import os\n",
"\n",
"# to make this notebook's output stable across runs\n",
"np.random.seed(42)\n",
"\n",
"# To plot pretty figures\n",
"%matplotlib inline\n",
"import matplotlib as mpl\n",
"import matplotlib.pyplot as plt\n",
"mpl.rc('axes', labelsize=14)\n",
"mpl.rc('xtick', labelsize=12)\n",
"mpl.rc('ytick', labelsize=12)\n",
"\n",
"# Where to save the figures\n",
"PROJECT_ROOT_DIR = \".\"\n",
"CHAPTER_ID = \"data\"\n",
"IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID)\n",
"os.makedirs(IMAGES_PATH, exist_ok=True)\n",
"\n",
"def save_fig(fig_id, tight_layout=True, fig_extension=\"png\", resolution=300):\n",
"    path = os.path.join(IMAGES_PATH, fig_id + \".\" + fig_extension)\n",
"    print(\"Saving figure\", fig_id)\n",
"    if tight_layout:\n",
"        plt.tight_layout()\n",
"    
plt.savefig(path, format=fig_extension, dpi=resolution)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Datasets" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X = tf.range(10)\n", "dataset = tf.data.Dataset.from_tensor_slices(X)\n", "dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Equivalently:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "dataset = tf.data.Dataset.range(10)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tf.Tensor(0, shape=(), dtype=int64)\n", "tf.Tensor(1, shape=(), dtype=int64)\n", "tf.Tensor(2, shape=(), dtype=int64)\n", "tf.Tensor(3, shape=(), dtype=int64)\n", "tf.Tensor(4, shape=(), dtype=int64)\n", "tf.Tensor(5, shape=(), dtype=int64)\n", "tf.Tensor(6, shape=(), dtype=int64)\n", "tf.Tensor(7, shape=(), dtype=int64)\n", "tf.Tensor(8, shape=(), dtype=int64)\n", "tf.Tensor(9, shape=(), dtype=int64)\n" ] } ], "source": [ "for item in dataset:\n", " print(item)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "tags": [ "raises-exception" ] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int64)\n", "tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int64)\n", "tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int64)\n", "tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int64)\n", "tf.Tensor([8 9], shape=(2,), dtype=int64)\n" ] } ], "source": [ "dataset = dataset.repeat(3).batch(7)\n", "for item in dataset:\n", " print(item)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "dataset = dataset.map(lambda x: x * 2)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ "tf.Tensor([ 0 2 4 6 8 10 12], shape=(7,), dtype=int64)\n", "tf.Tensor([14 16 18 0 2 4 6], shape=(7,), dtype=int64)\n", "tf.Tensor([ 8 10 12 14 16 18 0], shape=(7,), dtype=int64)\n", "tf.Tensor([ 2 4 6 8 10 12 14], shape=(7,), dtype=int64)\n", "tf.Tensor([16 18], shape=(2,), dtype=int64)\n" ] } ], "source": [ "for item in dataset:\n", " print(item)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "dataset = dataset.apply(tf.data.experimental.unbatch())" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "dataset = dataset.filter(lambda x: x < 10) # keep only items < 10" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tf.Tensor(0, shape=(), dtype=int64)\n", "tf.Tensor(2, shape=(), dtype=int64)\n", "tf.Tensor(4, shape=(), dtype=int64)\n" ] } ], "source": [ "for item in dataset.take(3):\n", " print(item)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tf.Tensor([0 3 4 2 1 5 8], shape=(7,), dtype=int64)\n", "tf.Tensor([6 9 7 2 3 1 4], shape=(7,), dtype=int64)\n", "tf.Tensor([6 0 7 9 0 1 2], shape=(7,), dtype=int64)\n", "tf.Tensor([8 4 5 5 3 8 9], shape=(7,), dtype=int64)\n", "tf.Tensor([7 6], shape=(2,), dtype=int64)\n" ] } ], "source": [ "dataset = tf.data.Dataset.range(10).repeat(3)\n", "dataset = dataset.shuffle(buffer_size=3, seed=42).batch(7)\n", "for item in dataset:\n", " print(item)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Split the California dataset to multiple CSV files" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's start by loading and preparing the California housing dataset. 
We first load it, then split it into a training set, a validation set and a test set, and finally we scale it:" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import fetch_california_housing\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import StandardScaler\n", "\n", "housing = fetch_california_housing()\n", "X_train_full, X_test, y_train_full, y_test = train_test_split(\n", " housing.data, housing.target.reshape(-1, 1), random_state=42)\n", "X_train, X_valid, y_train, y_valid = train_test_split(\n", " X_train_full, y_train_full, random_state=42)\n", "\n", "scaler = StandardScaler()\n", "scaler.fit(X_train)\n", "X_mean = scaler.mean_\n", "X_std = scaler.scale_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For a very large dataset that does not fit in memory, you will typically want to split it into many files first, then have TensorFlow read these files in parallel. 
To demonstrate this, let's start by splitting the housing dataset and saving it to 20 CSV files:" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [
"def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):\n",
"    \"\"\"Split the rows of `data` into `n_parts` CSV files under datasets/housing.\n",
"\n",
"    Files are named my_<name_prefix>_<NN>.csv. When `header` is not None, it is\n",
"    written as the first line of every file. Returns the list of file paths.\n",
"    \"\"\"\n",
"    housing_dir = os.path.join(\"datasets\", \"housing\")\n",
"    os.makedirs(housing_dir, exist_ok=True)\n",
"    path_format = os.path.join(housing_dir, \"my_{}_{:02d}.csv\")\n",
"\n",
"    filepaths = []\n",
"    m = len(data)\n",
"    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):\n",
"        part_csv = path_format.format(name_prefix, file_idx)\n",
"        filepaths.append(part_csv)\n",
"        with open(part_csv, \"wt\", encoding=\"utf-8\") as f:\n",
"            if header is not None:\n",
"                f.write(header)\n",
"                f.write(\"\\n\")\n",
"            for row_idx in row_indices:\n",
"                # Convert NumPy scalars to plain Python scalars before calling repr():\n",
"                # with NumPy >= 2.0, repr(np.float64(1.5)) is 'np.float64(1.5)',\n",
"                # which would end up verbatim in the CSV file.\n",
"                row = np.asarray(data[row_idx]).tolist()\n",
"                f.write(\",\".join([repr(col) for col in row]))\n",
"                f.write(\"\\n\")\n",
"    return filepaths" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "train_data = np.c_[X_train, y_train]\n", "valid_data = np.c_[X_valid, y_valid]\n", "test_data = np.c_[X_test, y_test]\n", "header_cols = housing.feature_names + [\"MedianHouseValue\"]\n", "header = \",\".join(header_cols)\n", "\n", "train_filepaths = save_to_multiple_csv_files(train_data, \"train\", header, n_parts=20)\n", "valid_filepaths = save_to_multiple_csv_files(valid_data, \"valid\", header, n_parts=10)\n", "test_filepaths = save_to_multiple_csv_files(test_data, \"test\", header, n_parts=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Okay, now let's take a peek at the first few lines of one of these CSV files:" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitudeMedianHouseValue
03.521415.03.0499451.1065481447.01.60599337.63-122.431.442
15.32755.06.4900600.9910543464.03.44334033.69-117.391.687
23.100029.07.5423731.5915251328.02.25084738.44-122.981.621
37.173612.06.2890030.9974421054.02.69565233.55-117.702.621
42.054913.05.3124571.0850923297.02.24438433.93-116.930.956
\n", "
" ], "text/plain": [ " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", "0 3.5214 15.0 3.049945 1.106548 1447.0 1.605993 37.63 \n", "1 5.3275 5.0 6.490060 0.991054 3464.0 3.443340 33.69 \n", "2 3.1000 29.0 7.542373 1.591525 1328.0 2.250847 38.44 \n", "3 7.1736 12.0 6.289003 0.997442 1054.0 2.695652 33.55 \n", "4 2.0549 13.0 5.312457 1.085092 3297.0 2.244384 33.93 \n", "\n", " Longitude MedianHouseValue \n", "0 -122.43 1.442 \n", "1 -117.39 1.687 \n", "2 -122.98 1.621 \n", "3 -117.70 2.621 \n", "4 -116.93 0.956 " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "pd.read_csv(train_filepaths[0]).head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Or in text mode:" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue\n", "3.5214,15.0,3.0499445061043287,1.106548279689234,1447.0,1.6059933407325193,37.63,-122.43,1.442\n", "5.3275,5.0,6.490059642147117,0.9910536779324056,3464.0,3.4433399602385686,33.69,-117.39,1.687\n", "3.1,29.0,7.5423728813559325,1.5915254237288134,1328.0,2.2508474576271187,38.44,-122.98,1.621\n", "7.1736,12.0,6.289002557544757,0.9974424552429667,1054.0,2.6956521739130435,33.55,-117.7,2.621\n" ] } ], "source": [ "with open(train_filepaths[0]) as f:\n", " for i in range(5):\n", " print(f.readline(), end=\"\")" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['datasets/housing/my_train_00.csv',\n", " 'datasets/housing/my_train_01.csv',\n", " 'datasets/housing/my_train_02.csv',\n", " 'datasets/housing/my_train_03.csv',\n", " 'datasets/housing/my_train_04.csv',\n", " 'datasets/housing/my_train_05.csv',\n", " 'datasets/housing/my_train_06.csv',\n", " 'datasets/housing/my_train_07.csv',\n", " 
'datasets/housing/my_train_08.csv',\n", " 'datasets/housing/my_train_09.csv',\n", " 'datasets/housing/my_train_10.csv',\n", " 'datasets/housing/my_train_11.csv',\n", " 'datasets/housing/my_train_12.csv',\n", " 'datasets/housing/my_train_13.csv',\n", " 'datasets/housing/my_train_14.csv',\n", " 'datasets/housing/my_train_15.csv',\n", " 'datasets/housing/my_train_16.csv',\n", " 'datasets/housing/my_train_17.csv',\n", " 'datasets/housing/my_train_18.csv',\n", " 'datasets/housing/my_train_19.csv']" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_filepaths" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Building an Input Pipeline" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tf.Tensor(b'datasets/housing/my_train_05.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_16.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_01.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_17.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_00.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_14.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_10.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_02.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_12.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_19.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_07.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_09.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_13.csv', shape=(), dtype=string)\n", 
"tf.Tensor(b'datasets/housing/my_train_15.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_11.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_18.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_04.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_06.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_03.csv', shape=(), dtype=string)\n", "tf.Tensor(b'datasets/housing/my_train_08.csv', shape=(), dtype=string)\n" ] } ], "source": [ "for filepath in filepath_dataset:\n", " print(filepath)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "n_readers = 5\n", "dataset = filepath_dataset.interleave(\n", " lambda filepath: tf.data.TextLineDataset(filepath).skip(1),\n", " cycle_length=n_readers)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "b'4.2083,44.0,5.323204419889502,0.9171270718232044,846.0,2.3370165745856353,37.47,-122.2,2.782'\n", "b'4.1812,52.0,5.701388888888889,0.9965277777777778,692.0,2.4027777777777777,33.73,-118.31,3.215'\n", "b'3.6875,44.0,4.524475524475524,0.993006993006993,457.0,3.195804195804196,34.04,-118.15,1.625'\n", "b'3.3456,37.0,4.514084507042254,0.9084507042253521,458.0,3.2253521126760565,36.67,-121.7,2.526'\n", "b'3.5214,15.0,3.0499445061043287,1.106548279689234,1447.0,1.6059933407325193,37.63,-122.43,1.442'\n" ] } ], "source": [ "for line in dataset.take(5):\n", " print(line.numpy())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Notice that field 4 is interpreted as a string." 
] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[,\n", " ,\n", " ,\n", " ,\n", " ]" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "record_defaults=[0, np.nan, tf.constant(np.nan, dtype=tf.float64), \"Hello\", tf.constant([])]\n", "parsed_fields = tf.io.decode_csv('1,2,3,4,5', record_defaults)\n", "parsed_fields" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Notice that all missing fields are replaced with their default value, when provided:" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[,\n", " ,\n", " ,\n", " ,\n", " ]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parsed_fields = tf.io.decode_csv(',,,,5', record_defaults)\n", "parsed_fields" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The 5th field is compulsory (since we provided `tf.constant([])` as the \"default value\"), so we get an exception if we do not provide it:" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Field 4 is required but missing in record 0! 
[Op:DecodeCSV]\n" ] } ], "source": [ "try:\n", "    parsed_fields = tf.io.decode_csv(',,,,', record_defaults)\n", "except tf.errors.InvalidArgumentError as ex:\n", "    print(ex)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The number of fields should match exactly the number of fields in the `record_defaults`:" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Expect 5 fields but have 7 in record 0 [Op:DecodeCSV]\n" ] } ], "source": [ "try:\n", "    parsed_fields = tf.io.decode_csv('1,2,3,4,5,6,7', record_defaults)\n", "except tf.errors.InvalidArgumentError as ex:\n", "    print(ex)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "scrolled": false }, "outputs": [], "source": [
"n_inputs = 8 # X_train.shape[-1]\n",
"\n",
"@tf.function\n",
"def preprocess(line):\n",
"    \"\"\"Parse one CSV line into a (standardized features, target) pair.\n",
"\n",
"    Each of the `n_inputs` feature fields defaults to 0. when missing, whereas the\n",
"    last field (the target) is required. Features are standardized using the\n",
"    global `X_mean` and `X_std`.\n",
"    \"\"\"\n",
"    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]\n",
"    fields = tf.io.decode_csv(line, record_defaults=defs)\n",
"    x = tf.stack(fields[:-1])\n",
"    y = tf.stack(fields[-1:])\n",
"    return (x - X_mean) / X_std, y" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(,\n", " )" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preprocess(b'4.2083,44.0,5.3232,0.9171,846.0,2.3370,37.47,-122.2,2.782')" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [
"def csv_reader_dataset(filepaths, repeat=1, n_readers=5,\n",
"                       n_read_threads=None, shuffle_buffer_size=10000,\n",
"                       n_parse_threads=5, batch_size=32, seed=None):\n",
"    \"\"\"Build a shuffled, preprocessed and batched dataset from CSV files.\n",
"\n",
"    The file paths are listed and repeated `repeat` times; lines from `n_readers`\n",
"    files at a time are interleaved (skipping each file's header line), shuffled\n",
"    with a buffer of `shuffle_buffer_size` lines, parsed with `preprocess()`,\n",
"    grouped into batches of `batch_size`, and one batch is prefetched.\n",
"    `seed` is passed to both the file listing and the line shuffling.\n",
"    \"\"\"\n",
"    dataset = tf.data.Dataset.list_files(filepaths, seed=seed).repeat(repeat)\n",
"    dataset = dataset.interleave(\n",
"        lambda filepath: tf.data.TextLineDataset(filepath).skip(1),\n",
"        cycle_length=n_readers, num_parallel_calls=n_read_threads)\n",
"    dataset = dataset.shuffle(shuffle_buffer_size, seed=seed)\n",
"    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)\n",
"    dataset = dataset.batch(batch_size)\n",
"    return dataset.prefetch(1)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "X = tf.Tensor(\n", "[[-1.1309323 0.5834586 -1.1141092 -0.0205977 0.4861637 0.05622157\n", " -0.72447133 0.63188833]\n", " [ 1.9003258 -0.998705 0.6331733 -0.03226789 0.05084941 -0.28879014\n", " 1.0229574 -1.2872185 ]\n", " [-0.08566543 -1.6315705 0.04572354 -0.16350564 0.18226504 0.09547261\n", " -0.92123175 0.65187943]], shape=(3, 8), dtype=float32)\n", "y = tf.Tensor(\n", "[[2.219]\n", " [3.702]\n", " [2.875]], shape=(3, 1), dtype=float32)\n", "\n", "X = tf.Tensor(\n", "[[ 0.92681414 -1.0778131 0.194918 -0.2745752 -0.02215928 0.13869788\n", " 1.2525111 -1.5520943 ]\n", " [ 0.12441459 -2.0271113 -0.0920579 -0.17962678 -0.38263968 0.11107364\n", " -1.3007004 1.9912548 ]\n", " [-0.10782045 1.8491895 -0.01604682 0.04678809 -0.33609664 -0.2658364\n", " 0.9807942 -1.4471437 ]], shape=(3, 8), dtype=float32)\n", "y = tf.Tensor(\n", "[[1.724]\n", " [0.832]\n", " [3.118]], shape=(3, 1), dtype=float32)\n", "\n" ] } ], "source": [ "train_set = csv_reader_dataset(train_filepaths, batch_size=3)\n", "for X_batch, y_batch in train_set.take(2):\n", "    print(\"X =\", X_batch)\n", "    print(\"y =\", y_batch)\n", "    print()" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "train_set = csv_reader_dataset(train_filepaths, repeat=None)\n", "valid_set = csv_reader_dataset(valid_filepaths)\n", "test_set = csv_reader_dataset(test_filepaths)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "model = keras.models.Sequential([\n", "    keras.layers.Dense(30, activation=\"relu\", input_shape=X_train.shape[1:]),\n", "    keras.layers.Dense(1),\n", "])" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [
"model.compile(loss=\"mse\", optimizer=keras.optimizers.SGD(learning_rate=1e-3))" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10\n", "362/362 [==============================] - 1s 3ms/step - loss: 2.4158 - val_loss: 0.9218\n", "Epoch 2/10\n", "362/362 [==============================] - 1s 2ms/step - loss: 0.7571 - val_loss: 0.6992\n", "Epoch 3/10\n", "362/362 [==============================] - 1s 2ms/step - loss: 0.6767 - val_loss: 0.6532\n", "Epoch 4/10\n", "362/362 [==============================] - 1s 2ms/step - loss: 0.6406 - val_loss: 0.6751\n", "Epoch 5/10\n", "362/362 [==============================] - 1s 2ms/step - loss: 0.6012 - val_loss: 0.5904\n", "Epoch 6/10\n", "362/362 [==============================] - 1s 2ms/step - loss: 0.5792 - val_loss: 0.6392\n", "Epoch 7/10\n", "362/362 [==============================] - 1s 2ms/step - loss: 0.5531 - val_loss: 0.5686\n", "Epoch 8/10\n", "362/362 [==============================] - 1s 2ms/step - loss: 0.5050 - val_loss: 0.5086\n", "Epoch 9/10\n", "362/362 [==============================] - 1s 2ms/step - loss: 0.5039 - val_loss: 0.4825\n", "Epoch 10/10\n", "362/362 [==============================] - 1s 2ms/step - loss: 0.4930 - val_loss: 0.4761\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "batch_size = 32\n", "model.fit(train_set, steps_per_epoch=len(X_train) // batch_size, epochs=10,\n", "          validation_data=valid_set)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "161/161 [==============================] - 0s 1ms/step - loss: 0.4804\n" ] }, { "data": { "text/plain": [ "0.4803512151937307" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model.evaluate(test_set, steps=len(X_test) // batch_size)" ] }, {
"cell_type": "code", "execution_count": 35, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "array([[2.5751495],\n", " [2.1568842],\n", " [2.019339 ],\n", " ...,\n", " [3.553556 ],\n", " [2.2041245],\n", " [2.366016 ]], dtype=float32)" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_set = test_set.map(lambda X, y: X) # we could instead just pass test_set, Keras would ignore the labels\n", "X_new = X_test\n", "model.predict(new_set, steps=len(X_new) // batch_size)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Global step 1810/1810" ] } ], "source": [
"optimizer = keras.optimizers.Nadam(learning_rate=0.01)\n",
"loss_fn = keras.losses.mean_squared_error\n",
"\n",
"n_epochs = 5\n",
"batch_size = 32\n",
"n_steps_per_epoch = len(X_train) // batch_size\n",
"total_steps = n_epochs * n_steps_per_epoch\n",
"global_step = 0\n",
"for X_batch, y_batch in train_set.take(total_steps):\n",
"    global_step += 1\n",
"    print(\"\\rGlobal step {}/{}\".format(global_step, total_steps), end=\"\")\n",
"    with tf.GradientTape() as tape:\n",
"        y_pred = model(X_batch)\n",
"        main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))\n",
"        loss = tf.add_n([main_loss] + model.losses)\n",
"    gradients = tape.gradient(loss, model.trainable_variables)\n",
"    optimizer.apply_gradients(zip(gradients, model.trainable_variables))" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [
"optimizer = keras.optimizers.Nadam(learning_rate=0.01)\n",
"loss_fn = keras.losses.mean_squared_error\n",
"\n",
"@tf.function\n",
"def train(model, n_epochs, batch_size=32,\n",
"          n_readers=5, n_read_threads=5, shuffle_buffer_size=10000, n_parse_threads=5):\n",
"    \"\"\"Run a custom training loop over the training CSV files, repeated `n_epochs` times.\n",
"\n",
"    Uses the global `optimizer` and `loss_fn`; the other arguments are forwarded\n",
"    to `csv_reader_dataset()`.\n",
"    \"\"\"\n",
"    train_set = csv_reader_dataset(train_filepaths, repeat=n_epochs, n_readers=n_readers,\n",
"                                   n_read_threads=n_read_threads, shuffle_buffer_size=shuffle_buffer_size,\n",
"                                   n_parse_threads=n_parse_threads, batch_size=batch_size)\n",
"    for X_batch, y_batch in train_set:\n",
"        with tf.GradientTape() as tape:\n",
"            y_pred = model(X_batch)\n",
"            main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))\n",
"            loss = tf.add_n([main_loss] + model.losses)\n",
"        gradients = tape.gradient(loss, model.trainable_variables)\n",
"        optimizer.apply_gradients(zip(gradients, model.trainable_variables))\n",
"\n",
"train(model, 5)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Global step 100 / 1810\n", "Global step 200 / 1810\n", "Global step 300 / 1810\n", "Global step 400 / 1810\n", "Global step 500 / 1810\n", "Global step 600 / 1810\n", "Global step 700 / 1810\n", "Global step 800 / 1810\n", "Global step 900 / 1810\n", "Global step 1000 / 1810\n", "Global step 1100 / 1810\n", "Global step 1200 / 1810\n", "Global step 1300 / 1810\n", "Global step 1400 / 1810\n", "Global step 1500 / 1810\n", "Global step 1600 / 1810\n", "Global step 1700 / 1810\n", "Global step 1800 / 1810\n" ] } ], "source": [
"optimizer = keras.optimizers.Nadam(learning_rate=0.01)\n",
"loss_fn = keras.losses.mean_squared_error\n",
"\n",
"@tf.function\n",
"def train(model, n_epochs, batch_size=32,\n",
"          n_readers=5, n_read_threads=5, shuffle_buffer_size=10000, n_parse_threads=5):\n",
"    \"\"\"Same custom training loop as above, but capped at `total_steps` batches,\n",
"    printing the global step every 100 steps.\n",
"    \"\"\"\n",
"    train_set = csv_reader_dataset(train_filepaths, repeat=n_epochs, n_readers=n_readers,\n",
"                                   n_read_threads=n_read_threads, shuffle_buffer_size=shuffle_buffer_size,\n",
"                                   n_parse_threads=n_parse_threads, batch_size=batch_size)\n",
"    n_steps_per_epoch = len(X_train) // batch_size\n",
"    total_steps = n_epochs * n_steps_per_epoch\n",
"    global_step = 0\n",
"    for X_batch, y_batch in train_set.take(total_steps):\n",
"        global_step += 1\n",
"        if tf.equal(global_step % 100, 0):\n",
"            tf.print(\"\\rGlobal step\", global_step, \"/\", total_steps)\n",
"        with tf.GradientTape() as tape:\n",
"            y_pred = model(X_batch)\n",
"            main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))\n",
"            loss = tf.add_n([main_loss] + model.losses)\n",
"        gradients = tape.gradient(loss, model.trainable_variables)\n",
"        optimizer.apply_gradients(zip(gradients, model.trainable_variables))\n",
"\n",
"train(model, 5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here is a short description of each method in the `Dataset` class:" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "● apply() Applies a transformation function to this dataset.\n", "● batch() Combines consecutive elements of this dataset into batches.\n", "● cache() Caches the elements in this dataset.\n", "● concatenate() Creates a `Dataset` by concatenating given dataset with this dataset.\n", "● filter() Filters this dataset according to `predicate`.\n", "● flat_map() Maps `map_func` across this dataset and flattens the result.\n", "● from_generator() Creates a `Dataset` whose elements are generated by `generator`.\n", "● from_tensor_slices() Creates a `Dataset` whose elements are slices of the given tensors.\n", "● from_tensors() Creates a `Dataset` with a single element, comprising the given tensors.\n", "● interleave() Maps `map_func` across this dataset, and interleaves the results.\n", "● list_files() A dataset of all files matching one or more glob patterns.\n", "● map() Maps `map_func` across the elements of this dataset.\n", "● options() Returns the options for this dataset and its inputs.\n", "● padded_batch() Combines consecutive elements of this dataset into padded batches.\n", "● prefetch() Creates a `Dataset` that prefetches elements from this dataset.\n", "● range() Creates a `Dataset` of a step-separated range of values.\n", "● reduce() Reduces the input dataset to a single element.\n", "● repeat() Repeats this dataset `count` times.\n", "● shard() Creates a `Dataset` that includes only 1/`num_shards` of this dataset.\n", "● 
shuffle() Randomly shuffles the elements of this dataset.\n", "● skip() Creates a `Dataset` that skips `count` elements from this dataset.\n", "● take() Creates a `Dataset` with at most `count` elements from this dataset.\n", "● window() Combines input elements into a dataset of windows.\n", "● with_options() Returns a new `tf.data.Dataset` with the given options set.\n", "● zip() Creates a `Dataset` by zipping together the given datasets.\n" ] } ], "source": [
"for m in dir(tf.data.Dataset):\n",
"    if not (m.startswith(\"_\") or m.endswith(\"_\")):\n",
"        func = getattr(tf.data.Dataset, m)\n",
"        # Every object has a `__doc__` attribute, but its value is None when there is\n",
"        # no docstring, so check the value; also skip non-callable attributes, since\n",
"        # this cell lists methods only.\n",
"        doc = getattr(func, \"__doc__\", None)\n",
"        if callable(func) and doc:\n",
"            print(\"● {:21s}{}\".format(m + \"()\", doc.split(\"\\n\")[0]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## The `TFRecord` binary format" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A TFRecord file is just a list of binary records. You can create one using a `tf.io.TFRecordWriter`:" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "with tf.io.TFRecordWriter(\"my_data.tfrecord\") as f:\n", "    f.write(b\"This is the first record\")\n", "    f.write(b\"And this is the second record\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "And you can read it using a `tf.data.TFRecordDataset`:" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tf.Tensor(b'This is the first record', shape=(), dtype=string)\n", "tf.Tensor(b'And this is the second record', shape=(), dtype=string)\n" ] } ], "source": [ "filepaths = [\"my_data.tfrecord\"]\n", "dataset = tf.data.TFRecordDataset(filepaths)\n", "for item in dataset:\n", "    print(item)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can read multiple TFRecord files with just one `TFRecordDataset`. 
By default it will read them one at a time, but if you set `num_parallel_reads=3`, it will read 3 at a time in parallel and interleave their records:" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tf.Tensor(b'File 0 record 0', shape=(), dtype=string)\n", "tf.Tensor(b'File 1 record 0', shape=(), dtype=string)\n", "tf.Tensor(b'File 2 record 0', shape=(), dtype=string)\n", "tf.Tensor(b'File 0 record 1', shape=(), dtype=string)\n", "tf.Tensor(b'File 1 record 1', shape=(), dtype=string)\n", "tf.Tensor(b'File 2 record 1', shape=(), dtype=string)\n", "tf.Tensor(b'File 0 record 2', shape=(), dtype=string)\n", "tf.Tensor(b'File 1 record 2', shape=(), dtype=string)\n", "tf.Tensor(b'File 2 record 2', shape=(), dtype=string)\n", "tf.Tensor(b'File 3 record 0', shape=(), dtype=string)\n", "tf.Tensor(b'File 4 record 0', shape=(), dtype=string)\n", "tf.Tensor(b'File 3 record 1', shape=(), dtype=string)\n", "tf.Tensor(b'File 4 record 1', shape=(), dtype=string)\n", "tf.Tensor(b'File 3 record 2', shape=(), dtype=string)\n", "tf.Tensor(b'File 4 record 2', shape=(), dtype=string)\n" ] } ], "source": [ "filepaths = [\"my_test_{}.tfrecord\".format(i) for i in range(5)]\n", "for i, filepath in enumerate(filepaths):\n", " with tf.io.TFRecordWriter(filepath) as f:\n", " for j in range(3):\n", " f.write(\"File {} record {}\".format(i, j).encode(\"utf-8\"))\n", "\n", "dataset = tf.data.TFRecordDataset(filepaths, num_parallel_reads=3)\n", "for item in dataset:\n", " print(item)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "options = tf.io.TFRecordOptions(compression_type=\"GZIP\")\n", "with tf.io.TFRecordWriter(\"my_compressed.tfrecord\", options) as f:\n", " f.write(b\"This is the first record\")\n", " f.write(b\"And this is the second record\")" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "tf.Tensor(b'This is the first record', shape=(), dtype=string)\n", "tf.Tensor(b'And this is the second record', shape=(), dtype=string)\n" ] } ], "source": [ "dataset = tf.data.TFRecordDataset([\"my_compressed.tfrecord\"],\n", " compression_type=\"GZIP\")\n", "for item in dataset:\n", " print(item)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### A Brief Intro to Protocol Buffers" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For this section you need to [install protobuf](https://developers.google.com/protocol-buffers/docs/downloads). In general you will not have to do so when using TensorFlow, as it comes with functions to create and parse protocol buffers of type `tf.train.Example`, which are generally sufficient. However, in this section we will learn about protocol buffers by creating our own simple protobuf definition, so we need the protobuf compiler (`protoc`): we will use it to compile the protobuf definition to a Python module that we can then use in our code." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First let's write a simple protobuf definition:" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Overwriting person.proto\n" ] } ], "source": [ "%%writefile person.proto\n", "syntax = \"proto3\";\n", "message Person {\n", " string name = 1;\n", " int32 id = 2;\n", " repeated string email = 3;\n", "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "And let's compile it (the `--descriptor_set_out` and `--include_imports` options are only required for the `tf.io.decode_proto()` example below):" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "!protoc person.proto --python_out=. 
--descriptor_set_out=person.desc --include_imports" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "person.desc person.proto person_pb2.py\r\n" ] } ], "source": [ "!ls person*" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "name: \"Al\"\n", "id: 123\n", "email: \"a@b.com\"\n", "\n" ] } ], "source": [ "from person_pb2 import Person\n", "\n", "person = Person(name=\"Al\", id=123, email=[\"a@b.com\"]) # create a Person\n", "print(person) # display the Person" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Al'" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "person.name # read a field" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "person.name = \"Alice\" # modify a field" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'a@b.com'" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "person.email[0] # repeated fields can be accessed like arrays" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "person.email.append(\"c@d.com\") # add an email address" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "b'\\n\\x05Alice\\x10{\\x1a\\x07a@b.com\\x1a\\x07c@d.com'" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "s = person.SerializeToString() # serialize to a byte string\n", "s" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "27" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "person2 = Person() # 
create a new Person\n", "person2.ParseFromString(s) # parse the byte string (27 bytes)" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "person == person2 # now they are equal" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Custom protobuf" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In rare cases, you may want to parse a custom protobuf (like the one we just created) in TensorFlow. For this you can use the `tf.io.decode_proto()` function:" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[,\n", " ,\n", " ]" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "person_tf = tf.io.decode_proto(\n", " bytes=s,\n", " message_type=\"Person\",\n", " field_names=[\"name\", \"id\", \"email\"],\n", " output_types=[tf.string, tf.int32, tf.string],\n", " descriptor_source=\"person.desc\")\n", "\n", "person_tf.values" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For more details, see the [`tf.io.decode_proto()`](https://www.tensorflow.org/api_docs/python/tf/io/decode_proto) documentation." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### TensorFlow Protobufs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here is the definition of the `tf.train.Example` protobuf:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```proto\n", "syntax = \"proto3\";\n", "\n", "message BytesList { repeated bytes value = 1; }\n", "message FloatList { repeated float value = 1 [packed = true]; }\n", "message Int64List { repeated int64 value = 1 [packed = true]; }\n", "message Feature {\n", " oneof kind {\n", " BytesList bytes_list = 1;\n", " FloatList float_list = 2;\n", " Int64List int64_list = 3;\n", " }\n", "};\n", "message Features { map<string, Feature> feature = 1; };\n", "message Example { Features features = 1; };\n", "```" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": [ "# WARNING: there's currently a bug preventing \"from tensorflow.train import X\"\n", "# so we work around it by writing \"X = tf.train.X\"\n", "#from tensorflow.train import BytesList, FloatList, Int64List\n", "#from tensorflow.train import Feature, Features, Example\n", "BytesList = tf.train.BytesList\n", "FloatList = tf.train.FloatList\n", "Int64List = tf.train.Int64List\n", "Feature = tf.train.Feature\n", "Features = tf.train.Features\n", "Example = tf.train.Example\n", "\n", "person_example = Example(\n", " features=Features(\n", " feature={\n", " \"name\": Feature(bytes_list=BytesList(value=[b\"Alice\"])),\n", " \"id\": Feature(int64_list=Int64List(value=[123])),\n", " \"emails\": Feature(bytes_list=BytesList(value=[b\"a@b.com\", b\"c@d.com\"]))\n", " }))\n", "\n", "with tf.io.TFRecordWriter(\"my_contacts.tfrecord\") as f:\n", " f.write(person_example.SerializeToString())" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "feature_description = {\n", " \"name\": tf.io.FixedLenFeature([], tf.string, default_value=\"\"),\n", " \"id\": tf.io.FixedLenFeature([], tf.int64, 
default_value=0),\n", " \"emails\": tf.io.VarLenFeature(tf.string),\n", "}\n", "for serialized_example in tf.data.TFRecordDataset([\"my_contacts.tfrecord\"]):\n", " parsed_example = tf.io.parse_single_example(serialized_example,\n", " feature_description)" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'emails': ,\n", " 'id': ,\n", " 'name': }" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parsed_example" ] }, { "cell_type": "code", "execution_count": 60, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'emails': ,\n", " 'id': ,\n", " 'name': }" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parsed_example" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parsed_example[\"emails\"].values[0]" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tf.sparse.to_dense(parsed_example[\"emails\"], default_value=b\"\")" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parsed_example[\"emails\"].values" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Putting Images in TFRecords" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "from sklearn.datasets import load_sample_images\n", "\n", "img = load_sample_images()[\"images\"][0]\n", "plt.imshow(img)\n", "plt.axis(\"off\")\n", "plt.title(\"Original Image\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "data = tf.io.encode_jpeg(img)\n", "example_with_image = Example(features=Features(feature={\n", " \"image\": Feature(bytes_list=BytesList(value=[data.numpy()]))}))\n", "serialized_example = example_with_image.SerializeToString()\n", "# then save to TFRecord" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "# Use a dedicated name for the image schema so that the contacts\n", "# `feature_description` defined earlier is not overwritten: a later cell\n", "# still relies on it to parse my_contacts.tfrecord.\n", "image_feature_description = { \"image\": tf.io.VarLenFeature(tf.string) }\n", "example_with_image = tf.io.parse_single_example(serialized_example,\n", "                                               image_feature_description)\n", "decoded_img = tf.io.decode_jpeg(example_with_image[\"image\"].values[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Or use `decode_image()` which supports BMP, GIF, JPEG and PNG formats:" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "decoded_img = tf.io.decode_image(example_with_image[\"image\"].values[0])" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "plt.imshow(decoded_img)\n", "plt.title(\"Decoded Image\")\n", "plt.axis(\"off\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Putting Tensors and Sparse Tensors in TFRecords" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Tensors can be serialized and parsed easily using `tf.io.serialize_tensor()` and `tf.io.parse_tensor()`:" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "t = tf.constant([[0., 1.], [2., 3.], [4., 5.]])\n", "s = tf.io.serialize_tensor(t)\n", "s" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tf.io.parse_tensor(s, out_type=tf.float32)" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "serialized_sparse = tf.io.serialize_sparse(parsed_example[\"emails\"])\n", "serialized_sparse" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "value: \"\\010\\t\\022\\010\\022\\002\\010\\002\\022\\002\\010\\001\\\"\\020\\000\\000\\000\\000\\000\\000\\000\\000\\001\\000\\000\\000\\000\\000\\000\\000\"\n", "value: \"\\010\\007\\022\\004\\022\\002\\010\\002\\\"\\020\\007\\007a@b.comc@d.com\"\n", "value: \"\\010\\t\\022\\004\\022\\002\\010\\001\\\"\\010\\002\\000\\000\\000\\000\\000\\000\\000\"" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "BytesList(value=serialized_sparse.numpy())" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, 
"outputs": [], "source": [ "dataset = tf.data.TFRecordDataset([\"my_contacts.tfrecord\"]).batch(10)\n", "for serialized_examples in dataset:\n", " parsed_examples = tf.io.parse_example(serialized_examples,\n", " feature_description)" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'image': }" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parsed_examples" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Handling Sequential Data Using `SequenceExample`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "```proto\n", "syntax = \"proto3\";\n", "\n", "message FeatureList { repeated Feature feature = 1; };\n", "message FeatureLists { map feature_list = 1; };\n", "message SequenceExample {\n", " Features context = 1;\n", " FeatureLists feature_lists = 2;\n", "};\n", "```" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "# WARNING: there's currently a bug preventing \"from tensorflow.train import X\"\n", "# so we work around it by writing \"X = tf.train.X\"\n", "#from tensorflow.train import FeatureList, FeatureLists, SequenceExample\n", "FeatureList = tf.train.FeatureList\n", "FeatureLists = tf.train.FeatureLists\n", "SequenceExample = tf.train.SequenceExample\n", "\n", "context = Features(feature={\n", " \"author_id\": Feature(int64_list=Int64List(value=[123])),\n", " \"title\": Feature(bytes_list=BytesList(value=[b\"A\", b\"desert\", b\"place\", b\".\"])),\n", " \"pub_date\": Feature(int64_list=Int64List(value=[1623, 12, 25]))\n", "})\n", "\n", "content = [[\"When\", \"shall\", \"we\", \"three\", \"meet\", \"again\", \"?\"],\n", " [\"In\", \"thunder\", \",\", \"lightning\", \",\", \"or\", \"in\", \"rain\", \"?\"]]\n", "comments = [[\"When\", \"the\", \"hurlyburly\", \"'s\", \"done\", \".\"],\n", " [\"When\", \"the\", \"battle\", \"'s\", \"lost\", \"and\", \"won\", \".\"]]\n", "\n", "def 
words_to_feature(words):\n", " return Feature(bytes_list=BytesList(value=[word.encode(\"utf-8\")\n", " for word in words]))\n", "\n", "content_features = [words_to_feature(sentence) for sentence in content]\n", "comments_features = [words_to_feature(comment) for comment in comments]\n", " \n", "sequence_example = SequenceExample(\n", " context=context,\n", " feature_lists=FeatureLists(feature_list={\n", " \"content\": FeatureList(feature=content_features),\n", " \"comments\": FeatureList(feature=comments_features)\n", " }))" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "context {\n", " feature {\n", " key: \"author_id\"\n", " value {\n", " int64_list {\n", " value: 123\n", " }\n", " }\n", " }\n", " feature {\n", " key: \"pub_date\"\n", " value {\n", " int64_list {\n", " value: 1623\n", " value: 12\n", " value: 25\n", " }\n", " }\n", " }\n", " feature {\n", " key: \"title\"\n", " value {\n", " bytes_list {\n", " value: \"A\"\n", " value: \"desert\"\n", " value: \"place\"\n", " value: \".\"\n", " }\n", " }\n", " }\n", "}\n", "feature_lists {\n", " feature_list {\n", " key: \"comments\"\n", " value {\n", " feature {\n", " bytes_list {\n", " value: \"When\"\n", " value: \"the\"\n", " value: \"hurlyburly\"\n", " value: \"\\'s\"\n", " value: \"done\"\n", " value: \".\"\n", " }\n", " }\n", " feature {\n", " bytes_list {\n", " value: \"When\"\n", " value: \"the\"\n", " value: \"battle\"\n", " value: \"\\'s\"\n", " value: \"lost\"\n", " value: \"and\"\n", " value: \"won\"\n", " value: \".\"\n", " }\n", " }\n", " }\n", " }\n", " feature_list {\n", " key: \"content\"\n", " value {\n", " feature {\n", " bytes_list {\n", " value: \"When\"\n", " value: \"shall\"\n", " value: \"we\"\n", " value: \"three\"\n", " value: \"meet\"\n", " value: \"again\"\n", " value: \"?\"\n", " }\n", " }\n", " feature {\n", " bytes_list {\n", " value: \"In\"\n", " value: \"thunder\"\n", " value: \",\"\n", " value: \"lightning\"\n", " 
value: \",\"\n", " value: \"or\"\n", " value: \"in\"\n", " value: \"rain\"\n", " value: \"?\"\n", " }\n", " }\n", " }\n", " }\n", "}" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sequence_example" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [], "source": [ "serialized_sequence_example = sequence_example.SerializeToString()" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [], "source": [ "context_feature_descriptions = {\n", " \"author_id\": tf.io.FixedLenFeature([], tf.int64, default_value=0),\n", " \"title\": tf.io.VarLenFeature(tf.string),\n", " \"pub_date\": tf.io.FixedLenFeature([3], tf.int64, default_value=[0, 0, 0]),\n", "}\n", "sequence_feature_descriptions = {\n", " \"content\": tf.io.VarLenFeature(tf.string),\n", " \"comments\": tf.io.VarLenFeature(tf.string),\n", "}\n", "parsed_context, parsed_feature_lists = tf.io.parse_single_sequence_example(\n", " serialized_sequence_example, context_feature_descriptions,\n", " sequence_feature_descriptions)" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'title': ,\n", " 'author_id': ,\n", " 'pub_date': }" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parsed_context" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parsed_context[\"title\"].values" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'comments': ,\n", " 'content': }" ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parsed_feature_lists" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } 
], "source": [ "print(tf.RaggedTensor.from_sparse(parsed_feature_lists[\"content\"]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# The Features API" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's use the variant of the California housing dataset that we used in Chapter 2, since it contains categorical features and missing values:" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [], "source": [ "import os\n", "import tarfile\n", "# Import the submodule explicitly: a bare `import urllib` does not guarantee\n", "# that `urllib.request` is loaded.\n", "import urllib.request\n", "\n", "DOWNLOAD_ROOT = \"https://raw.githubusercontent.com/ageron/handson-ml2/master/\"\n", "HOUSING_PATH = os.path.join(\"datasets\", \"housing\")\n", "HOUSING_URL = DOWNLOAD_ROOT + \"datasets/housing/housing.tgz\"\n", "\n", "def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):\n", "    \"\"\"Download the housing tarball and extract it into `housing_path`.\n", "\n", "    Args:\n", "        housing_url: URL of the `housing.tgz` archive to download.\n", "        housing_path: local directory that receives the archive and its\n", "            extracted contents; it is created if it does not exist.\n", "    \"\"\"\n", "    os.makedirs(housing_path, exist_ok=True)\n", "    tgz_path = os.path.join(housing_path, \"housing.tgz\")\n", "    urllib.request.urlretrieve(housing_url, tgz_path)\n", "    # The context manager closes the archive even if extraction fails.\n", "    with tarfile.open(tgz_path) as housing_tgz:\n", "        housing_tgz.extractall(path=housing_path)" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [], "source": [ "fetch_housing_data()" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "def load_housing_data(housing_path=HOUSING_PATH):\n", " csv_path = os.path.join(housing_path, \"housing.csv\")\n", " return pd.read_csv(csv_path)" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomemedian_house_valueocean_proximity
0-122.2337.8841.0880.0129.0322.0126.08.3252452600.0NEAR BAY
1-122.2237.8621.07099.01106.02401.01138.08.3014358500.0NEAR BAY
2-122.2437.8552.01467.0190.0496.0177.07.2574352100.0NEAR BAY
3-122.2537.8552.01274.0235.0558.0219.05.6431341300.0NEAR BAY
4-122.2537.8552.01627.0280.0565.0259.03.8462342200.0NEAR BAY
\n", "
" ], "text/plain": [ " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", "0 -122.23 37.88 41.0 880.0 129.0 \n", "1 -122.22 37.86 21.0 7099.0 1106.0 \n", "2 -122.24 37.85 52.0 1467.0 190.0 \n", "3 -122.25 37.85 52.0 1274.0 235.0 \n", "4 -122.25 37.85 52.0 1627.0 280.0 \n", "\n", " population households median_income median_house_value ocean_proximity \n", "0 322.0 126.0 8.3252 452600.0 NEAR BAY \n", "1 2401.0 1138.0 8.3014 358500.0 NEAR BAY \n", "2 496.0 177.0 7.2574 352100.0 NEAR BAY \n", "3 558.0 219.0 5.6431 341300.0 NEAR BAY \n", "4 565.0 259.0 3.8462 342200.0 NEAR BAY " ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "housing = load_housing_data()\n", "housing.head()" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [], "source": [ "housing_median_age = tf.feature_column.numeric_column(\"housing_median_age\")" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [], "source": [ "age_mean, age_std = X_mean[1], X_std[1] # The median age is in column 1\n", "housing_median_age = tf.feature_column.numeric_column(\n", " \"housing_median_age\", normalizer_fn=lambda x: (x - age_mean) / age_std)" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [], "source": [ "median_income = tf.feature_column.numeric_column(\"median_income\")\n", "bucketized_income = tf.feature_column.bucketized_column(\n", " median_income, boundaries=[1.5, 3., 4.5, 6.])" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "BucketizedColumn(source_column=NumericColumn(key='median_income', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(1.5, 3.0, 4.5, 6.0))" ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bucketized_income" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [], "source": [ 
"ocean_prox_vocab = ['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']\n", "ocean_proximity = tf.feature_column.categorical_column_with_vocabulary_list(\n", " \"ocean_proximity\", ocean_prox_vocab)" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "VocabularyListCategoricalColumn(key='ocean_proximity', vocabulary_list=('<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'), dtype=tf.string, default_value=-1, num_oov_buckets=0)" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ocean_proximity" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "HashedCategoricalColumn(key='city', hash_bucket_size=1000, dtype=tf.string)" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Just an example, it's not used later on\n", "city_hash = tf.feature_column.categorical_column_with_hash_bucket(\n", " \"city\", hash_bucket_size=1000)\n", "city_hash" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [], "source": [ "bucketized_age = tf.feature_column.bucketized_column(\n", " housing_median_age, boundaries=[-1., -0.5, 0., 0.5, 1.]) # age was scaled\n", "age_and_ocean_proximity = tf.feature_column.crossed_column(\n", " [bucketized_age, ocean_proximity], hash_bucket_size=100)" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [], "source": [ "latitude = tf.feature_column.numeric_column(\"latitude\")\n", "longitude = tf.feature_column.numeric_column(\"longitude\")\n", "bucketized_latitude = tf.feature_column.bucketized_column(\n", " latitude, boundaries=list(np.linspace(32., 42., 20 - 1)))\n", "bucketized_longitude = tf.feature_column.bucketized_column(\n", " longitude, boundaries=list(np.linspace(-125., -114., 20 - 1)))\n", "location = tf.feature_column.crossed_column(\n", " [bucketized_latitude, 
bucketized_longitude], hash_bucket_size=1000)" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [], "source": [ "ocean_proximity_one_hot = tf.feature_column.indicator_column(ocean_proximity)" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [], "source": [ "ocean_proximity_embed = tf.feature_column.embedding_column(ocean_proximity,\n", " dimension=2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using Feature Columns for Parsing" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [], "source": [ "median_house_value = tf.feature_column.numeric_column(\"median_house_value\")" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'housing_median_age': FixedLenFeature(shape=(1,), dtype=tf.float32, default_value=None),\n", " 'median_house_value': FixedLenFeature(shape=(1,), dtype=tf.float32, default_value=None)}" ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "columns = [housing_median_age, median_house_value]\n", "feature_descriptions = tf.feature_column.make_parse_example_spec(columns)\n", "feature_descriptions" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [], "source": [ "with tf.io.TFRecordWriter(\"my_data_with_features.tfrecords\") as f:\n", " for x, y in zip(X_train[:, 1:2], y_train):\n", " example = Example(features=Features(feature={\n", " \"housing_median_age\": Feature(float_list=FloatList(value=[x])),\n", " \"median_house_value\": Feature(float_list=FloatList(value=[y]))\n", " }))\n", " f.write(example.SerializeToString())" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [], "source": [ "def parse_examples(serialized_examples):\n", " examples = tf.io.parse_example(serialized_examples, feature_descriptions)\n", " targets = examples.pop(\"median_house_value\") # separate the targets\n", " 
return examples, targets\n", "\n", "batch_size = 32\n", "dataset = tf.data.TFRecordDataset([\"my_data_with_features.tfrecords\"])\n", "dataset = dataset.repeat().shuffle(10000).batch(batch_size).map(parse_examples)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Warning**: the `DenseFeatures` layer currently does not work with the Functional API, see [TF issue #27416](https://github.com/tensorflow/tensorflow/issues/27416). Hopefully this will be resolved before the final release of TF 2.0." ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/5\n", "362/362 [==============================] - 1s 3ms/step - loss: 5.0263 - accuracy: 0.0017\n", "Epoch 2/5\n", "362/362 [==============================] - 1s 2ms/step - loss: 2.1655 - accuracy: 0.0036\n", "Epoch 3/5\n", "362/362 [==============================] - 1s 2ms/step - loss: 1.5688 - accuracy: 0.0028\n", "Epoch 4/5\n", "362/362 [==============================] - 1s 2ms/step - loss: 1.3550 - accuracy: 0.0022\n", "Epoch 5/5\n", "362/362 [==============================] - 1s 2ms/step - loss: 1.3466 - accuracy: 0.0035\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [ "columns_without_target = columns[:-1]\n", "model = keras.models.Sequential([\n", " keras.layers.DenseFeatures(feature_columns=columns_without_target),\n", " keras.layers.Dense(1)\n", "])\n", "model.compile(loss=\"mse\",\n", " optimizer=keras.optimizers.SGD(lr=1e-3),\n", " metrics=[\"accuracy\"])\n", "model.fit(dataset, steps_per_epoch=len(X_train) // batch_size, epochs=5)" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING: Logging before flag parsing goes to stderr.\n", "W0314 09:13:03.443593 140735783818112 deprecation.py:323] From 
/Users/ageron/.virtualenvs/tf2/lib/python3.6/site-packages/tensorflow/python/feature_column/feature_column_v2.py:3038: VocabularyListCategoricalColumn._num_buckets (from tensorflow.python.feature_column.feature_column_v2) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.\n", "W0314 09:13:03.451946 140735783818112 deprecation.py:323] From /Users/ageron/.virtualenvs/tf2/lib/python3.6/site-packages/tensorflow/python/ops/lookup_ops.py:1347: to_int64 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Use `tf.cast` instead.\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "some_columns = [ocean_proximity_embed, bucketized_income]\n", "dense_features = keras.layers.DenseFeatures(some_columns)\n", "dense_features({\n", " \"ocean_proximity\": [[\"NEAR OCEAN\"], [\"INLAND\"], [\"INLAND\"]],\n", " \"median_income\": [[3.], [7.2], [1.]]\n", "})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# TF Transform" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TF Transform is not installed. 
Try running: pip3 install -U tensorflow-transform\n" ] } ], "source": [ "try:\n", "    import tensorflow_transform as tft\n", "\n", "    def preprocess(inputs):  # inputs is a batch of input features\n", "        \"\"\"Standardize the median age and encode ocean proximity as vocabulary ids.\n", "\n", "        Returns a dict with the keys `standardized_median_age` and\n", "        `ocean_proximity_id`.\n", "        \"\"\"\n", "        median_age = inputs[\"housing_median_age\"]\n", "        ocean_proximity = inputs[\"ocean_proximity\"]\n", "        # scale_to_z_score() already centers the values on their mean, so\n", "        # subtracting tft.mean() beforehand would only add a redundant pass.\n", "        standardized_age = tft.scale_to_z_score(median_age)\n", "        ocean_proximity_id = tft.compute_and_apply_vocabulary(ocean_proximity)\n", "        return {\n", "            \"standardized_median_age\": standardized_age,\n", "            \"ocean_proximity_id\": ocean_proximity_id\n", "        }\n", "except ImportError:\n", "    print(\"TF Transform is not installed. Try running: pip3 install -U tensorflow-transform\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# TensorFlow Datasets" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [], "source": [ "import tensorflow_datasets as tfds\n", "\n", "datasets = tfds.load(name=\"mnist\")\n", "mnist_train, mnist_test = datasets[\"train\"], datasets[\"test\"]" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['bair_robot_pushing_small', 'cats_vs_dogs', 'celeb_a', 'celeb_a_hq', 'chexpert', 'cifar10', 'cifar100', 'coco2014', 'colorectal_histology', 'colorectal_histology_large', 'diabetic_retinopathy_detection', 'dummy_dataset_shared_generator', 'dummy_mnist', 'fashion_mnist', 'flores_translate_neen', 'flores_translate_sien', 'horses_or_humans', 'image_label_folder', 'imagenet2012', 'imdb_reviews', 'kmnist', 'lm1b', 'lsun', 'mnist', 'moving_mnist', 'multi_nli', 'nsynth', 'omniglot', 'open_images_v4', 'quickdraw_bitmap', 'rock_paper_scissors', 'squad', 'starcraft_video', 'svhn_cropped', 'ted_hrlr_translate', 'ted_multi_translate', 'tf_flowers', 'titanic', 'wmt_translate_ende', 'wmt_translate_enfr']\n" ] } ], "source": [ "print(tfds.list_builders())" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, 
"outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXUAAABsCAYAAACVUyIvAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAD/tJREFUeJzt3XeMVNXfx/H3EVQEREXFggrmEWMHS+zGgu0xGvXBgtgrAvaCDWNUNJaYn0ZQUewNG2J51AB2xIa9EIiKBckiojwKCrb7/DF+uDOzCzuzOzv3zp3PKyHA3Zm7Zy+7h+8553u+J0RRhJmZZcMySTfAzMwqx526mVmGuFM3M8sQd+pmZhniTt3MLEPcqZuZZYg7dTOzDMl0px5CmF/06+8Qws1JtytpIYSeIYTnQgg/hxAaQggjQgjtk25X0kIIXUMIT4YQFoQQvgkhDEi6TWkRQugVQlgYQngg6bYkLYRwWghhSghhUQjhnqTbUyzTnXoURZ31C1gT+B14LOFmpcEtwA/AWkAfYFdgcKItSoeRwB/AGsCRwK0hhE2TbVJqjATeTboRKTELGA7clXRDmpLpTr1IP3Id2etJNyQF1gcejaJoYRRFDcALQF13XiGETuS+Ry6Nomh+FEWTgKeBo5NtWfJCCP2BecCLSbclDaIoGhtF0ThgbtJtaUo9derHAvdFrosAcCPQP4TQMYTQHfhvch17PdsQ+CuKoul51z7C/9l1Aa4Azkm6LVaauujUQwg9yE0x3Jt0W1LiNXKd1S/ATGAKMC7RFiWvM7nnke//gBUTaEuaXAncGUXRzKQbYqWpi06d3BB6UhRFM5JuSNJCCMuQi8rHAp2A1YBVgGuTbFcKzAe6FF3rAvyaQFtSIYTQB9gT+E/SbbHS1UunfgyO0qUrsB4wIoqiRVEUzQXuBvZLtlmJmw60DyH0yrvWG/gsofakwW5AT+DbEEIDcB7QL4TwfpKNsqXLfKceQtgR6I6zXgCIouhHYAYwKITQPoSwMrn1ho+TbVmyoihaQG70ckUIoVMIYSfgQOD+ZFuWqNuB/yKXIdUHuA34X2CfJBuVtH9/bjoA7YB2IYQOaUoJznynTq7DGhtFUd0Oo5vwP8C+wBzgC+BP4OxEW5QOg4EVyGVJPQwMiqKobiP1KIp+i6KoQb/ITVEtjKJoTtJtS9gwcunRFwJH/fvnYYm2KE9wMoiZWXbUQ6RuZlY33KmbmWWIO3Uzswxxp25mliHu1M3MMiSp3Mp6SbkJZb7ez6UxP5PG/Ewa8zP5lyN1M7MMcaduZpYh7tTNzDLEnbqZWYa4UzczyxB36mZmGeJO3cwsQ1JTA7hSbrjhBgDOO++8Jj++wQYbAHDZZZcBMGDAAACWWcb/v5lZy1166aUAjB49GoAJEyYAsNlmm1W1He7JzMwyJDOR+t9//w3AK6+8AsSRd6dOnQpe99VXXwFw9NFHA/DDDz8AMHjwYAA6dOjQ5m1NwksvvQTAvffmTvWbNWsWABMnTgTggAMOAKBfv34AHHvssdVuYtX8+eefALz99tsF14cMGQLAxx/nDoHS98gpp5wCQMeOHQHYaqutqtLOatIzufzyywHo27cvALvvvntZ91G0Onz4cACeeeaZxR/bf//9W93ONHv66acBaGhoAOCcc84B4PnnnwegXbt2VWmHI3UzswxJ6uSjin/STz/9FIDNN98cgO233x6AN998s+B1jzzyCABXXnklANOmTQPiufbJkycDsMoqq1SiWYnVfvn119zpfXfccQcAF110ERBHZL179wZg/fXXB2D8+PEA/PbbbwDcdtttQBylVlgiNT0UMSlCv+qqq5p8nUZ9xZHVaqutBsDdd98NwL777luppkHCdU5GjRoFwKBBgwAYOHAgALfeemtZ91l99dUBmDt3LhBHr9CiSL2mar+oz5g3bx4Q90VTp
kwBYLnllqvEp3HtFzOzepKZOXVFYfrf8OCDD27ydYcffnjB74roFb19//33QMUi9cS8/PLLQJwF1LlzZyCOnPbZp/BA+OnTpwMwYsQIAM4//3wAll9+eaC259i1bqB1k5kzZ7boPj/++GPBfTTaGTlyJABrr712q9qZpCeeeKLg74sWLSrr/a+++ioAv/zyS8XaVCsUmWuEJ0OHDgUqFqGXzJG6mVmGuFM3M8uQml8onT17NgA77LADACuvvDIA77//fknvHzNmDABHHHEEACeddBIQLzC2UmILpbvssgsAn3/+OQB33XUXAAceeGBJ7997770BaN8+N0N33XXXAfFQc+edd25N89psAUxTK/q6AQ477DAAFixYUNI9lrRQuiRdunQBYNddd118TRtQunbtWtI9SGhRUNMlW265JQAzZswA4gVSLZguiaZpNtxwQwC+++47IH4mU6dOXfzatdZaq9zm1cRC6f333w/AMcccU3D9gw8+AKBPnz6V/HReKDUzqyc1v1B63333AXGEsddeeyXZnMQpapg0aRIQb6gpNUKXe+65B4BNN90UiNOzLrjgAgCee+65xa+9+uqrW97gCjv99NMBePbZZ8t+r1Lu/vnnH6Dwa1waRbv5G22U9njuueeW3Y5qOuSQQ4D450cjXY1cm6PFY0XoomfZgujcWsmRuplZhtRspD5nzhwAbrnlFiDe3n/hhRe26D6izRO1SvN4okisXNp8NH/+fAB69uwJxJtTevTo0cIWtg2ltH744Ydlv/fQQw8F4MYbbwRA60y658033wzE5QOyQHPdxZvzzjzzTCCeE2+O1lqKKSW2Hrz44osFf9f6wrrrrptEcxypm5llSc1G6l9++SUAX3/9NRDPpe+xxx5l3Ufbo+XUU09tfeMSpIJlUm5207XXXgvEZRS0+ej4448H0hehFxfnKmVjkUYdmjPX9v/iDWfHHXccEGcCffLJJ0Bp292VaaP2Lbvsss2+p63lbyjSz4vaqX9nFXZrjjbpPfDAAyV/fm1sW2GFFQo+V7du3Uq+R5qoFMIbb7xRcH2nnXYCYNVVV616m8CRuplZptRUpP7HH38s/nNxxoXyy0v12GOPAfDtt98C8Vx6tbf0VppK5z711FMAXH/99UDzJVQVoeuQke7duwNwxRVXAHDCCSdUvrEV0FxxrnzKxR43bhxQ+rZ+vU6jHkWYmr8vzvyAeKSjiFj7KJLw+++/A/FoC+LSy6J2llpWWNkyP//8c8H1PffcE4izpfKtuOKKBe048cQTgYrtCakaPU+N2L744gsAQsilkJc62mkrjtTNzDKkpiL1/F2iygnW/FX+br5SKNJXiVoVAFtzzTVb3c4k6WAH5ZMrg0PH9+kQBFGkq0hdkZfWGtIaoYuydJZmm222AeKIsKWFtzR6GTt2LAAnn3wyEOf0N0U7M5OM1N977z0AHn300SW+RvPCKlamEZ+e3UorrQTAwoULAbjmmmuavM/tt98ONH08pErQyl9//VXaF5Ay33zzDQBvvfVWwXWN9pdUTLBaHKmbmWVITUTqmkvPn0dX5KOSoWussUZJ99JqvebBFJnXetZLMR1fp4Mc9OyUHXPkkUcCcWSvwzFURraNDseouIceeghYep2WLbbYAqj+AcAADz/8MBDvfK4m7VkoZZe1Rmga+eh3ZQrpHqpls6Tdtspv10g6v6Sv/q1qnfY1FFONoaQ5Ujczy5CaiNQ1lz5hwoTF15TdUWqE/s477wDxnKF2SmoFO8k5z7aw0UYbAXFEddpppwHw4IMPAvGIRSv2ynrR86kVykgpPqCgqddUmmrELO1zF+cwV5N2V2sePJ9GLwMGDADi75PXXnut4HXKctFceXNKyc3WiClNNYNKoZ+Zzz77rOC6qjCWkoFVDY7UzcwypCYi9XXWWQeAd999d/G1cudHleWhbBcpN7+91myyySZAXCNHecTaESh9+/atbsMqRGsCmrtuinaC6vemcqjLoRowi
thKrbtebaprpNFY/qhWNV86duwIxMce6rr2NyjDQ/fSqEf3bE5+DRmNDpTDXysVHDWq1zMpHvnpaMhS6+W0NUfqZmYZUvMnHxX76aefABg/fjwQ7xjVzkjVutCcs+YdVftCevXqBcDWW2/dmuYkdvJRMdX92HHHHYHGJ0Np5V4RfRvXrajYiTaqG9/cjlmII0VlZpSbr666MqpNv7SqjToFR9FdCScgVfyUHz0bZZ2olg3AtttuW9In0glSqn+jEZ5OxFImyBlnnNHk+/O/j/QzVYZUnHykn5Ul9QWaBSj1mbaSTz4yM6snNRWp639EgGnTpgFxZK4IUxGpIvSW0gkw+fXVVetD85AlSE2k3tDQADSuY3LTTTcBcPHFFwOw8cYbA/FIp7hyYYVULAJTFURlUgwfPrzZG6rOtXbbynrrrQfEVQRVhU+n+yjnv5RKkNpxWcbJR6mISospG0p56/q+0dpMfkZaG0jFM1F1SZ2qJRod6hlUaW3FkbqZWT1JdfaLojDtfnzhhRcWf6w4i6VUmg9tyQlHac1yKIXmRkU7BVUxb7fddgPgoIMOAmC//fYD4rM+k6oN3RzVKd9uu+2AOApvqnKi6GPFGVRaX9GzUQSWvyuyXqhmzejRowuuK8PjkksuqXqbklK8/iSqi5S2fsGRuplZhqQ6UtdOPZ1Mc9RRRy3xtTqLU+/Rar2oiqMq1dV63fRyFUcbmh9WjWtlhjz++ONAnNWg6o6aV0wrfT29e/cGlh6pL4nmwfU9VG4EpnrtEOcu1xqtSY0ZMwZoXElRVVE1sssy7UJ/8sknC64rU64F2TxV4UjdzCxDUh2pd+jQAYgzW0qhuUA5++yzgbiOeL1F6MWUvaB89WI6+UanuyRZu6QlVGVyyJAhi69p1KGdgZWmOXidqAQtr9metLPOOguA119/veC6vsZSa8BkgSqdzps3r+C6sqOUKZY2jtTNzDLEnbqZWYakevqlHCo4pOG3jh7T5gktCNYrbSJSISZt1tGClxZ/Jk+eDMRDTl3XIQpttBmpYjTtkT/9obLCAwcObNW9O3fuDDQuSaBF5FqdcoH4SL4777yzyY8PHToUiIvr1QNtRiumUiNaNFbJXS2OF5ccqTZH6mZmGVJTZQKWZvbs2QD0798fiBdMVbgrIakpE6CRjKIPHZygEY3MmjUr15B/vy9UJnXYsGGVbE5Vt3+r5K5KHyjS0tdWTCmNKkesdEmNUtroMO5Et8QrHfOjjz4quK7oUxuwyiiRUQmJPhOld+rZTJ06FYgjcf1MqGyxEjvamMsEmJnVk8xE6imVmkhdtLlIqWkTJ05s8nUa6aiMQIVTQRONwFR+YsqUKUt9nebIe/ToUekmNCXRZzJq1CgABg0aBEC3bt2AuDSHjmyrslQU9EoZR+pmZvXEkXrbSl2knhKOwBrzM2nMz6QxR+pmZvXEnbqZWYa4UzczyxB36mZmGeJO3cwsQ5LKfjEzszbgSN3MLEPcqZuZZYg7dTOzDHGnbmaWIe7UzcwyxJ26mVmGuFM3M8sQd+pmZhniTt3MLEPcqZuZZYg7dTOzDHGnbmaWIe7UzcwyxJ26mVmGuFM3M8sQd+pmZhniTt3MLEPcqZuZZYg7dTOzDHGnbmaWIe7UzcwyxJ26mVmGuFM3M8sQd+pmZhniTt3MLEPcqZuZZcj/A5g6NzIdwMajAAAAAElFTkSuQmCC\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "plt.figure(figsize=(6,3))\n", "# Build the batched pipeline under a new name: rebinding `mnist_train` would\n", "# make this cell non-idempotent (re-running it would batch already-batched data)\n", "mnist_batches = mnist_train.repeat(5).batch(32).prefetch(1)\n", "for item in mnist_batches.take(1): # just showing part of the first batch\n", "    images = item[\"image\"]\n", "    labels = item[\"label\"]\n", "    for index in range(5):\n", "        plt.subplot(1, 5, index + 1)\n", "        image = images[index, ..., 0]\n", "        label = labels[index].numpy()\n", "        plt.imshow(image, cmap=\"binary\")\n", "        plt.title(label)\n", "        plt.axis(\"off\")" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(32, 28, 28, 1)\n", "[1 3 8 6 0 7 7 9 2 7 3 7 6 0 4 9 8 1 5 6 0 8 0 3 7 2 6 3 3 5 5 4]\n" ] } ], "source": [ "datasets = tfds.load(name=\"mnist\")\n", "mnist_train, mnist_test = datasets[\"train\"], datasets[\"test\"]\n", "mnist_train = mnist_train.repeat(5).batch(32)\n", "mnist_train = mnist_train.map(lambda items: (items[\"image\"], items[\"label\"]))\n", "mnist_train = mnist_train.prefetch(1)\n", "for images, labels in mnist_train.take(1):\n", "    print(images.shape)\n", "    print(labels.numpy())" ] }, { "cell_type": "code", "execution_count": 109, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/5\n", "1875/1875 [==============================] - 7s 4ms/step - loss: 31.4932 - accuracy: 0.8428\n", "Epoch 2/5\n", "1875/1875 [==============================] - 7s 4ms/step - loss: 26.2404 - accuracy: 0.8693\n", "Epoch 3/5\n", "1875/1875 [==============================] - 7s 4ms/step - loss: 24.9957 - accuracy: 0.8743\n", "Epoch 4/5\n", "1875/1875 [==============================] - 7s 4ms/step - loss: 24.0785 - accuracy: 0.8780\n", "Epoch 5/5\n", "1875/1875 [==============================] - 7s 4ms/step - loss: 23.8285 - accuracy: 0.8782\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 109, "metadata": {}, "output_type":
"execute_result" } ], "source": [ "datasets = tfds.load(name=\"mnist\", batch_size=32, as_supervised=True)\n", "# repeat() without a count never ends, so fit() relies on steps_per_epoch below\n", "mnist_train = datasets[\"train\"].repeat().prefetch(1)\n", "model = keras.models.Sequential([\n", "    keras.layers.Flatten(input_shape=[28, 28, 1]),\n", "    keras.layers.Lambda(lambda images: tf.cast(images, tf.float32)),\n", "    keras.layers.Dense(10, activation=\"softmax\")])\n", "model.compile(loss=\"sparse_categorical_crossentropy\",\n", "              # `learning_rate` is the supported name; `lr` is a deprecated alias\n", "              optimizer=keras.optimizers.SGD(learning_rate=1e-3),\n", "              metrics=[\"accuracy\"])\n", "model.fit(mnist_train, steps_per_epoch=60000 // 32, epochs=5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# TensorFlow Hub" ] }, { "cell_type": "code", "execution_count": 110, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model: \"sequential_3\"\n", "_________________________________________________________________\n", "Layer (type) Output Shape Param # \n", "=================================================================\n", "keras_layer (KerasLayer) (None, 50) 48190600 \n", "_________________________________________________________________\n", "dense_4 (Dense) (None, 16) 816 \n", "_________________________________________________________________\n", "dense_5 (Dense) (None, 1) 17 \n", "=================================================================\n", "Total params: 48,191,433\n", "Trainable params: 833\n", "Non-trainable params: 48,190,600\n", "_________________________________________________________________\n" ] } ], "source": [ "import tensorflow_hub as hub\n", "\n", "hub_layer = hub.KerasLayer(\"https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1\",\n", "                           output_shape=[50], input_shape=[], dtype=tf.string)\n", "\n", "model = keras.Sequential()\n", "model.add(hub_layer)\n", "model.add(keras.layers.Dense(16, activation='relu'))\n", "model.add(keras.layers.Dense(1, activation='sigmoid'))\n", "\n", "model.summary()" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {},
"outputs": [], "source": [ "# Apply the hub layer directly to a batch of two raw sentences\n", "sentences = tf.constant([\"It was a great movie\", \"The actors were amazing\"])\n", "embeddings = hub_layer(sentences)" ] }, { "cell_type": "code", "execution_count": 112, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 112, "metadata": {}, "output_type": "execute_result" } ], "source": [ "embeddings" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" }, "nav_menu": { "height": "264px", "width": "369px" }, "toc": { "navigate_menu": true, "number_sections": true, "sideBar": true, "threshold": 6, "toc_cell": false, "toc_section_display": "block", "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 1 }