Fit keras.model from a generator function

TF 2.x - just for the experience, I tried a simple experimental dataset to show the problem:

import numpy as np
import tensorflow as tf
import keras
from tensorflow.keras.callbacks import LambdaCallback
import tensorflow_datasets as tfds

data, info = tfds.load('iris',  split='train[:80%]',  
as_supervised=True, with_info=True)
print(info)
features, labels = tuple(zip(*data)) 

# NB: the generator should yield a dictionary for the inputs, and the output as is.
def gen(x_train, y_train):
    print('generator initiated')
    (x_train, y_train)= tfds.load('iris',  shuffle_files=True, as_supervised=True, with_info=True)
    idx = 0
    while True:
        yield tf.transpose([x_train[:32], tf.one_hot(y_train[:32])])
        print('generator yielded a batch %d' % idx)
        idx += 1
        
train_ds = tf.data.Dataset.from_generator(gen, args=(features, labels),
                                  output_types=(tf.float32, tf.int32),
                                  output_shapes=(tf.TensorShape([32,4]), tf.TensorShape([32,4 ])),
                                  )
                                  # OR
                                  #output_signature=(
                                  #    tf.TensorSpec(shape=(4,), dtype=tf.float32), 
                                  #    tf.TensorSpec(shape=(), dtype=tf.int32)),
                                  #)
# datasetGen = iter(train_ds)

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(512, activation='relu', input_shape=(32,4,)))   # 4 fields
model.add(tf.keras.layers.Dense(4, activation='softmax'))

model.compile(optimizer='rmsprop',
                loss='categorical_crossentropy',
                metrics=['accuracy'])

train_ds= train_ds.batch(32).prefetch(32)
# callbacks=[LambdaCallback(on_epoch_end=generator.on_epoch_end)],
history= model.fit(train_ds,  epochs = 7, verbose = 1)
print(history.history['accuracy'])

and I am getting:

In line: yield tf.transpose([x_train[:32], tf.one_hot(y_train[:32])])

TypeError: unhashable type: 'slice'

The problem seems to be here - x_train[:32]?
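(For context: the same error can be reproduced by slicing a plain dict. Inside gen the dataset is reloaded without a split argument, so tfds.load returns a dict of splits plus the info object, and slicing that dict raises exactly this TypeError. A tiny illustration, not a fix:)

# Minimal reproduction of the same TypeError (illustration only)
d = {'features': [1.0, 2.0, 3.0, 4.0], 'label': 0}
try:
    d[:32]            # slicing a dict uses the slice object as a (unhashable) key
except TypeError as e:
    print(e)          # -> unhashable type: 'slice'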

Question: how do I correct the code (the generator function? the output_signature? the input_shape? or somewhere else) so that the Dataset can be used in the model.fit() method?

(Sorry for the dummy example, but I'd like to test generator-function use in model.fit().)



Solution 1:[1]

Well, it was really a dummy example of generator use; moreover, tf.data always wins in speed compared with a plain Python generator. Nevertheless, the following works (the code also needs refactoring, e.g. for organizing pipelines for big data):

import tensorflow as tf
import numpy as np
import pandas as pd

# LOAD DATA
df= pd.read_csv('https://gist.githubusercontent.com/netj/8836201/raw/6f9306ad21398ea43cba4f7d537619d0e07d5ae3/iris.csv', dtype = 'float32', converters = {'variety' : str},  
                nrows=64, decimal='.')
# df.head()
_features=df.iloc[:,:4].copy()
_labels=df.iloc[:,-1:].copy()
_labels['variety1'] = pd.factorize(_labels['variety'])[0]
_target= _labels['variety1'].astype(np.int64).copy()
_targets= _target.to_numpy()[:,np.newaxis]   # 2-D column of integer labels (Series 2-D slicing is not supported in newer pandas)
#print(_features)
print(type(_targets))

# SPLIT for Train & Test
# https://www.kdnuggets.com/2020/07/getting-started-tensorflow2.html
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(_features,_targets, test_size=0.3)
# Typically, we normalize the data when we have a high amount of variance in it.
print(X_train.var())
print(X_test.var())
# Here we can see that both X_train and X_test have very low variance, so no need to normalize the data.

# PREPROCESSING
# 
# to_categorical
y_train = tf.keras.utils.to_categorical(y_train)
y_test = tf.keras.utils.to_categorical(y_test)
print(y_train[:5,:])

# convert our data to numpy arrays
X_train = X_train.values
X_test = X_test.values
#################################################
#################################################

def gen(_features, _labels):
    x_train= _features
    y_train= _labels

    #print('gen:\n', list(x_train))
    #print('gen:\n', list(y_train))

    idx = 0
    while idx<64:
        yield x_train[:32], y_train[:32]
        print('generator yielded a batch %d' % idx)
        idx += 1

#################################################
# train_ds <<<<<<<<<<<<<<<<<<<<<<<
train_ds = tf.data.Dataset.from_generator(gen, args=(X_train, y_train), 
                                  output_types=(tf.float32, tf.int64),
                                  output_shapes=(tf.TensorShape([32,4]), tf.TensorShape([32, 2 ])),

                                  )
                                  # OR
                                  #output_signature=(
                                  #    tf.TensorSpec(shape=(4,), dtype=tf.float32), 
                                  #    tf.TensorSpec(shape=(), dtype=tf.int32)),
                                  #)
# datasetGen = iter(train_ds)
# print('train_ds:\n',list(train_ds.as_numpy_iterator()))
#################################################
# Model    
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense((512), activation='relu', input_shape=(32,4 )))   # 4 fields
model.add(tf.keras.layers.Dense((2), activation='softmax'))

# INSTEAD OF ONE-HOT CAN USE sparse_categorical_crossentropy HERE
model.compile(optimizer='rmsprop',
                loss='categorical_crossentropy',
                metrics=['accuracy'])
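# OR (alternative mentioned above, sketched here but not used below, since y_train/y_test
# are already one-hot encoded): keep the integer labels and compile with the sparse loss:
# model.compile(optimizer='rmsprop',
#               loss='sparse_categorical_crossentropy',
#               metrics=['accuracy'])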

train_ds= train_ds.batch(32).prefetch(32)
# callbacks=[LambdaCallback(on_epoch_end=generator.on_epoch_end)],
history= model.fit(train_ds,  epochs = 7, verbose = 1)

A validation dataset built from X_test, y_test with tf.data.Dataset.from_tensor_slices() has problems: its elements have shape (4,) instead of the model's input shape (32,4,). But that comes from the inappropriate generator design from the very beginning, I think... Still, with train_ds the evaluate() and predict() methods work (even though evaluating on the training set is not the point of ML).
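For comparison, here is a minimal sketch of the plain tf.data route that the note above alludes to (my assumption, not part of the original code: the model is redefined with a per-sample input_shape=(4,), which is what from_tensor_slices naturally produces; names like model2/train_ds2 exist only for the sketch):

import tensorflow as tf

# per-sample pipeline from the arrays prepared above (X_train, y_train, X_test, y_test)
train_ds2 = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(64).batch(32)
test_ds2  = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(32)

model2 = tf.keras.models.Sequential([
    tf.keras.layers.Dense(512, activation='relu', input_shape=(4,)),  # 4 features per sample
    tf.keras.layers.Dense(2, activation='softmax')
])
model2.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model2.fit(train_ds2, epochs=7, verbose=1)
model2.evaluate(test_ds2, verbose=1)   # the test set now has compatible shapes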

##############################################
score = model.evaluate(train_ds, verbose=1)   # test_ds needed; batch_size must not be passed when the input is a tf.data.Dataset
print("Test Accuracy:", score[1])

y_pred = model.predict(train_ds)
print('PREDICTIONS:\n', y_pred)

##############################################
#https://medium.com/@nutanbhogendrasharma/tensorflow-deep-learning-model-with-iris-dataset-8ec344c49f91
#Print actual and predicted value
# If you need the numpy array version, convert with np.array(): https://stackoverflow.com/a/65499385/15893581
features, labels = tuple(zip(*train_ds))
actual = np.argmax(labels,axis=-1)
predicted = np.argmax(y_pred,axis=-1)
print(f"Actual: {actual}")
print(f"Predicted: {predicted}")

So an incoming test_ds, for example, still needs to be adapted (though it would be better to adapt the generator function here, I think), but the overall idea of using a generator in TF 2.x is clear now (worthwhile only for huge data)...

P.S. Advice on how to improve the model here is welcome.

I apologize for this dummy question, as I'm still a novice in ML, but I needed to somehow connect a generator and training to get the experience.

Solution 2:[2]

Finally I generated an iris-like dataset from a function (really not a quick operation)... The repeat() transformation still needs some attention, but the code design in general works (for truly random data):

# Importing the tensorflow library
import tensorflow as tf 
import numpy as np
import keras

#FeaturesDict({
#    'features': Tensor(shape=(4,), dtype=tf.float32),
#    'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=3),
#})

BATCH_SIZE= 12
EPOCHS = 7
QTY_BATCHES= 10    #  to be generated

# The Dataset.from_generator constructor converts the python generator to a fully functional tf.data.Dataset.
def gen():
    for i in range(BATCH_SIZE): 
      # should yield a pair Features - Label
        data= np.expand_dims(np.random.sample(4) , axis=0)  
        label= [np.random.randint(3)]     
        yield data, label
          
train_ds = tf.data.Dataset.from_generator(gen,
                                  (tf.float32, tf.int32),
                                  (tf.TensorShape([None,4]),
                                  tf.TensorShape([ 1])))

# Applying the Dataset.repeat() transformation with no arguments will repeat the input indefinitely.
# The Dataset.repeat transformation concatenates its arguments without signaling the end of one epoch
# and the beginning of the next. Because of this, a Dataset.batch applied after Dataset.repeat will
# yield batches that straddle epoch boundaries:
train_ds= train_ds.repeat(count= EPOCHS*BATCH_SIZE*QTY_BATCHES).batch(BATCH_SIZE, drop_remainder=True).prefetch(BATCH_SIZE)
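# Tiny side demo (not part of the pipeline above; just to illustrate the ordering note):
# batching BEFORE repeat preserves epoch boundaries, batching AFTER repeat straddles them.
_demo = tf.data.Dataset.range(5)
print([b.numpy() for b in _demo.batch(2).repeat(2)])   # [0 1] [2 3] [4] [0 1] [2 3] [4]
print([b.numpy() for b in _demo.repeat(2).batch(2)])   # [0 1] [2 3] [4 0] [1 2] [3 4]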

NUM_CLASSES= 3
train_ds = train_ds.map(lambda x, y: (x, tf.one_hot(y, depth=NUM_CLASSES)))

for x, y in train_ds:
    print(x)
    print(y)

# Build a simple linear model
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(64, activation='relu', input_shape=(None,4)))   # unknown(variable) batch_size, 4 fields
model.add(tf.keras.layers.Dense(3, activation='softmax'))

model.compile(optimizer='rmsprop',
                loss='categorical_crossentropy',
                metrics=['accuracy'])

# steps_per_epoch = int( np.ceil(x_train.shape[0] / batch_size) )
# steps_per_epoch is the number of batches drawn per epoch: if 500 steps are selected, the network trains on 500 batches to complete one epoch.
# NB: batch_size must not be passed to fit() when the input is a tf.data.Dataset (the dataset is already batched).
history= model.fit(train_ds, epochs= EPOCHS,
                   steps_per_epoch= (QTY_BATCHES*BATCH_SIZE)//BATCH_SIZE,
                   verbose = 1)
print(history.history['accuracy'])
print(history.history['loss'])

# Keras - Plot training, validation and test set accuracy
# https://stackoverflow.com/questions/41908379/keras-plot-training-validation-and-test-set-accuracy
import keras
from matplotlib import pyplot as plt

plt.plot(history.history['accuracy'])
#plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
#plt.legend(['train', 'val'], loc='upper left')
plt.legend(['train'], loc='upper left')
plt.show()

plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# plt.legend(['train', 'val'], loc='upper left')
plt.legend(['train'], loc='upper left')
plt.show()

Solution 3:[3]

OK, I've got a working case for the initial dataset:

import numpy as np
import tensorflow as tf
import keras
from tensorflow.keras.callbacks import LambdaCallback
import tensorflow_datasets as tfds

data, info = tfds.load('iris',  split='train[:100%]', batch_size=10,  as_supervised=True, with_info=True)
print(info)
NUM_CLASSES= info.features["label"].num_classes

data = data.map(lambda x, y: (x, tf.one_hot(y, depth=NUM_CLASSES)))
features, labels = tuple(zip(*data))
print(features)
print(labels)

# NB: the generator should yield a dictionary for the inputs, and the output as is.
def gen(x_train, y_train):
    print('generator initiated')
    print(x_train.shape)
    print(y_train.shape)
    idx = 0
    while True:
        yield x_train, y_train
        print('generator yielded a batch %d' % idx)
        idx += 1
        
train_ds = tf.data.Dataset.from_generator(gen, args=(features, labels),
                                  output_types=(tf.float32, tf.int32),
                                  output_shapes=(tf.TensorShape([None,10,4]), tf.TensorShape([ None, 10, 3 ])),
                                  )
                                  # OR (better! because prev. is Deprecated)
                                  #output_signature=(
                                  #    tf.TensorSpec(shape=(4,), dtype=tf.float32), 
                                  #    tf.TensorSpec(shape=(), dtype=tf.int32)),
                                  #)
#it = iter(train_ds)
#print(it.get_next())

for feature, label in train_ds:
   print("shape of ds_generated: ", feature.shape,label.shape)
   break

#num_val = len(train_ds)   # TypeError: The dataset length is unknown. BECAUSE it is FLOW
#print(num_val)


model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(512, activation='relu', input_shape=(None,10,4)))   # 4 fields
model.add(tf.keras.layers.Dense(3, activation='softmax'))

model.compile(optimizer='rmsprop',
                loss='categorical_crossentropy',
                metrics=['accuracy'])

train_ds= train_ds.batch(32).prefetch(32)
# callbacks=[LambdaCallback(on_epoch_end=generator.on_epoch_end)],
history= model.fit(train_ds,  epochs = 2, steps_per_epoch= 120 // 10, verbose = 1)
print(history.history['accuracy'])
  1. I moved the one-hot encoding out of the generator function's scope.
  2. I split the dataset into features and labels.
  3. I gave the model the correct input_shape (with matching shape changes in the generator function), according to [variable_rows_count_in_batch, batch_size, columns_features].
  4. I set verbose = 1 for readable debugging in a multithreaded environment.

The key is to define a variable batch dimension with None and to set steps_per_epoch.

-- It still does not help if I take split='train[:50%]' and steps_per_epoch= 60 // 10, because of the not-fully-filled LAST batch. The source of the problem in my code IS the generator function's output_shapes -- that much is clear, because the generator really was a dummy for testing purposes...
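A minimal sketch of how that last-batch issue could be avoided (my assumption, not the original code: a per-batch output_signature with a None row dimension and a model with input_shape=(4,), so a shorter final batch is accepted):

import tensorflow as tf
import tensorflow_datasets as tfds

data, info = tfds.load('iris', split='train[:50%]', batch_size=10,
                       as_supervised=True, with_info=True)
NUM_CLASSES = info.features["label"].num_classes
data = data.map(lambda x, y: (x, tf.one_hot(y, depth=NUM_CLASSES)))

def gen():
    for x, y in data:      # yields one batch per step; the last batch may have fewer than 10 rows
        yield x, y

train_ds = tf.data.Dataset.from_generator(
    gen,
    output_signature=(
        tf.TensorSpec(shape=(None, 4), dtype=tf.float32),            # None: rows per batch may vary
        tf.TensorSpec(shape=(None, NUM_CLASSES), dtype=tf.float32),  # one-hot labels
    ))

model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(512, activation='relu', input_shape=(4,)),
    tf.keras.layers.Dense(NUM_CLASSES, activation='softmax')
])
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(train_ds, epochs=2, verbose=1)   # finite generator, so steps_per_epoch is optional here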

For real cases, use a logical output and appropriate shapes!

P.S. though for 5 epochs I am getting:

Graph execution error: >> ZMQError: Too many open file

AttributeError: '_thread._local' object has no attribute 'event_pipe'

-- probably NOT enough memory to finish training!... Decreasing the output size in Dense(512, ...) helps (as does decreasing the number of epochs).

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution 1:
Solution 2: JeeyCi
Solution 3: