The code below was derived from the YouTube video https: //www.youtube.com/watch?v=XNKeayZW4dY This is definitely not a tutorial for REAL beginners! """ https: //www.youtube.com/watch?v=XNKeayZW4dY""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import pandas as pd import tensorflow as tf from datetime import datetime from sklearn.preprocessing import LabelEncoder from tensorflow import keras startTime = datetime.now() layers = keras.layers print("You have TensorFlow ", tf.__version__) URL = "https://storage.googleapis.com/sara-cloud-ml/wine_data.csv" path = tf.keras.utils.get_file(URL.split('/')[-1], URL) data = pd.read_csv(path)# convert to Pandas frame data = data.sample(frac = 1)# shuffle the data print(data.head())# print first 5 rows # limit the varieties in the dataset data = data[pd.notnull(data['country'])] data = data[pd.notnull(data['price'])] data = data.drop(data.columns[0], axis = 1) variety_threshold = 500 value_counts = data['variety'].value_counts() to_remove = value_counts[value_counts <= variety_threshold].index data.replace(to_remove, np.nan, inplace = True) data = data[pd.notnull(data['variety'])] # split data into train & test train_size = int(len(data) * 0.8) print("Train size: %d" % train_size) print("Test size: %d" % (len(data) - train_size)) # train features description_train = data['description'][: train_size] variety_train = data['variety'][: train_size] labels_train = data['price'][: train_size] # test features description_test = data['description'][train_size: ] variety_test = data['variety'][train_size: ] labels_test = data['price'][train_size: ] # Create a tokenizer vocab_size = 12000# it 's a hyperparameter, can adjust tokenize = keras.preprocessing.text.Tokenizer(num_words = vocab_size, char_level = False) tokenize.fit_on_texts(description_train)# only fit on train # Wide feature 1: sparse BOW vocab_size vector description_bow_train = 
tokenize.texts_to_matrix(description_train) description_bow_test = tokenize.texts_to_matrix(description_test) # Wide feature 2: one-hot vector using sklearn encoder = LabelEncoder() encoder.fit(variety_train) variety_train = encoder.transform(variety_train) variety_test = encoder.transform(variety_test) num_classes = np.max(variety_train) + 1 # Convert labels to one hot variety_train = keras.utils.to_categorical(variety_train, num_classes) variety_test = keras.utils.to_categorical(variety_test, num_classes) # define out models bow_inputs = layers.Input(shape = (vocab_size, )) variety_inputs = layers.Input(shape = (num_classes, )) merged_layer = layers.concatenate([bow_inputs, variety_inputs]) merged_layer = layers.Dense(256, activation = 'relu')(merged_layer) predictions = layers.Dense(1)(merged_layer) wide_model = keras.Model(inputs = [bow_inputs, variety_inputs], outputs = predictions) wide_model.compile(loss = 'mse', optimizer = 'adam', metrics = ['accuracy']) print(wide_model.summary()) # Deep model feature: word embeddings of descriptions train_embed = tokenize.texts_to_sequences(description_train) test_embed = tokenize.texts_to_sequences(description_test) max_seq_length = 170 train_embed = keras.preprocessing.sequence.pad_sequences( train_embed, maxlen = max_seq_length, padding = "post") test_embed = keras.preprocessing.sequence.pad_sequences( test_embed, maxlen = max_seq_length, padding = "post") # define the deep models deep_inputs = layers.Input(shape = (max_seq_length, )) embedding = layers.Embedding(vocab_size, 8, input_length = max_seq_length)(deep_inputs) embedding = layers.Flatten()(embedding) embed_out = layers.Dense(1)(embedding) deep_model = keras.Model(inputs = deep_inputs, outputs = embed_out) print(deep_model.summary()) deep_model.compile(loss = 'mse', optimizer = 'adam', metrics = ['accuracy']) # combine deep and wide merged_out = layers.concatenate([wide_model.output, deep_model.output]) merged_out = layers.Dense(1)(merged_out) combined_model 
= keras.Model(wide_model.input + [deep_model.input], merged_out) print(combined_model.summary()) combined_model.compile(loss = 'mse', optimizer = 'adam', metrics = ['accuracy']) # Run training combined_model.fit([description_bow_train, variety_train] + [train_embed], labels_train, epochs = 10, batch_size = 128) combined_model.evaluate([description_bow_test, variety_test] + [test_embed], labels_test, batch_size = 128) print("************************************ LINE 120") # Generate predictions predictions = combined_model.predict([description_bow_test, variety_test] + [test_embed]) num_predictions = 40 diff = 0 for i in range(num_predictions): val = predictions[i] print(description_test.iloc[i]) print('Predicted: ', val[0], 'Actual: ', labels_test.iloc[i], '\n') diff += abs(val[0] - labels_test.iloc[i]) print('Average prediction difference: ', diff / num_predictions) print("Time taken:", datetime.now() - startTime) print("\n"*10) |
# Scraped-page navigation artifact ("Blog >") — not part of the program.