Use Keras Neural Translation Model (h5 format model) in spaCy v3

First, I'm not a developer by trade; my developer is not available for health reasons, but I have some experience in Python/spaCy development. I need some guidance with this problem: I have an h5 Keras model which was written to translate a dead language from its Latin-alphabet written form into Arabic "letters" using neural machine translation.

Here's the code for creating and training the Keras model (dataset.txt contains tab-delimited sentence pairs, one side in the Latin alphabet and the other in Arabic script):

import string
import re
from unicodedata import normalize
from numpy import array
from pickle import load
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle
from numpy import argmax
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='rt', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

# split a loaded document into sentences
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in  lines]
    return pairs

# clean a list of lines
def clean_pairs(lines):
    cleaned = list()
    for pair in lines:
        clean_pair = list()
        for line in pair:
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return array(cleaned)

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

# load dataset
filename = 'dataset.txt'
doc = load_doc(filename)
# split into latin-bok pairs
pairs = to_pairs(doc)
# clean sentences
clean_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(clean_pairs, 'latin-bok.pkl')
# spot check
for i in range(100):
    print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))
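
# --- split the cleaned data and save train/test sets ---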

# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

# load dataset
raw_dataset = load_clean_sentences('latin-bok.pkl')

# dataset size (n_sentences is defined but not applied; the full dataset is kept)
n_sentences = 10000
dataset = raw_dataset[:, :]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:70000], dataset[70000:]
# save
save_clean_data(dataset, 'latin-bok-both.pkl')
save_clean_data(train, 'latin-bok-train.pkl')
save_clean_data(test, 'latin-bok-test.pkl')
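
# --- build tokenizers, encode the data, then define and train the NMT model ---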

# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

# one hot encode target sequence
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model

# load datasets
dataset = load_clean_sentences('latin-bok-both.pkl')
train = load_clean_sentences('latin-bok-train.pkl')
test = load_clean_sentences('latin-bok-test.pkl')

# prepare latin tokenizer
latin_tokenizer = create_tokenizer(dataset[:, 0])
latin_vocab_size = len(latin_tokenizer.word_index) + 1
latin_length = max_length(dataset[:, 0])
print('latin Vocabulary Size: %d' % latin_vocab_size)
print('latin Max Length: %d' % (latin_length))
# prepare bok tokenizer
bok_tokenizer = create_tokenizer(dataset[:, 1])
bok_vocab_size = len(bok_tokenizer.word_index) + 1
bok_length = max_length(dataset[:, 1])
print('bok Vocabulary Size: %d' % bok_vocab_size)
print('bok Max Length: %d' % (bok_length))

# prepare training data
trainX = encode_sequences(bok_tokenizer, bok_length, train[:, 1])
trainY = encode_sequences(latin_tokenizer, latin_length, train[:, 0])
trainY = encode_output(trainY, latin_vocab_size)
# prepare validation data
testX = encode_sequences(bok_tokenizer, bok_length, test[:, 1])
testY = encode_sequences(latin_tokenizer, latin_length, test[:, 0])
testY = encode_output(testY, latin_vocab_size)

# define model
model = define_model(bok_vocab_size, latin_vocab_size, bok_length, latin_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
print(model.summary())
# fit model
filename = 'translator.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)
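
# --- load a saved model and evaluate it on the train and test splits ---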

# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))  # pickle files are opened in binary mode, without an encoding argument

# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)

# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X

# map an integer to a word
def word_for_id(integer, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # calculate BLEU score (corpus_bleu comes from nltk)
    from nltk.translate.bleu_score import corpus_bleu
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

# load datasets
dataset = load_clean_sentences('latin-bok-both.pkl')
train = load_clean_sentences('latin-bok-train.pkl')
test = load_clean_sentences('latin-bok-test.pkl')
# prepare latin tokenizer
latin_tokenizer = create_tokenizer(dataset[:, 0])
latin_vocab_size = len(latin_tokenizer.word_index) + 1
latin_length = max_length(dataset[:, 0])
# prepare bok tokenizer
bok_tokenizer = create_tokenizer(dataset[:, 1])
bok_vocab_size = len(bok_tokenizer.word_index) + 1
bok_length = max_length(dataset[:, 1])
# prepare data
trainX = encode_sequences(bok_tokenizer, bok_length, train[:, 1])
testX = encode_sequences(bok_tokenizer, bok_length, test[:, 1])

# load model
model = load_model('translator2.h5')
# test on some training sequences
print('train')
evaluate_model(model, latin_tokenizer, trainX, train)
# test on some test sequences
print('test')
evaluate_model(model, latin_tokenizer, testX, test)
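
For context, this is roughly how a single new source-language sentence can be translated with the helpers above (a minimal sketch; the input string is just a placeholder for real text):

# minimal sketch: translate one new sentence, reusing the helpers and tokenizers above
new_text = ['placeholder source sentence']   # stands in for a real source-language sentence
encoded = encode_sequences(bok_tokenizer, bok_length, new_text)
print(predict_sequence(model, latin_tokenizer, encoded))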

The model works fine and is accurate. Now I want to use it for this task: get all the tokens of a Doc, "translate" (or rather "predict") them, and store the translations in the Doc object. As I can't do that myself, I hired a freelance developer, but I'm getting desperate because nothing is done correctly. The developer wrote me this code, but nothing works:

code.py
from typing import List
from thinc.api import TensorFlowWrapper
from keras.models import load_model
from thinc.api import Model
from spacy.tokens.doc import Doc
import spacy
from spacy.lookups import Lookups
import os
import json 
from spacy.pipeline import TrainablePipe
from spacy.language import Language
from itertools import islice
from typing import Optional
from collections.abc import Callable
from collections.abc import Iterable, Iterator
from spacy.training import Example

@spacy.registry.architectures("LatinBok.v1")
def create_tensorflow_model() -> Model[List[Doc], List[str]]:    
        model = load_model('translator.h5')
        wrapped_model = TensorFlowWrapper(model)
        return wrapped_model

@spacy.registry.misc("mon_lookups_loader")
def load_lookups(data_path):
    lookups = Lookups()
    # "lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"
    with open(os.path.join('lemma/mon_lookup_lemma.json'), 'r',encoding="utf-8") as lemma_lookup:
        lookups.add_table('lemma_lookup', json.load(lemma_lookup))
    return lookups

class TrainableComponent(TrainablePipe):
    def __init__(self, vocab, model, name="translator"):
        self.model = model
        self.vocab = vocab
        self.name = name

    def predict(self, docs):
        return self.model.predict(docs)

    def set_annotations(self, docs, scores):
        c = 0
        get_instances = self.model.attrs["get_instances"]
        for doc in docs:
            for (e1, e2) in get_instances(doc):
                offset = (e1.start, e2.start)            
                if offset not in doc._.trans:
                    doc._.trans[offset] = {}
                for j,token in enumerate(doc):
                    doc._.trans[offset][token] = scores[c, j]            
                c += 1

    def initialize(
        self,
        get_examples: Callable[[], Iterable[Example]],
        *,
        nlp: Language = None,
        labels: Optional[List[str]] = None,
    ):
        if labels is not None:
            for label in labels:
                self.add_label(label)    
        else:
            for example in get_examples():
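                # this line reads the custom Doc extension "trans"; it must be registered with Doc.set_extension first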
                relations = example.reference._.trans            
                for indices, label_dict in relations.items():
                    for label in label_dict.keys():
                        self.add_label(label)    
        subbatch = list(islice(get_examples(), 10))
        doc_sample = [eg.reference for eg in subbatch]
        label_sample = self._examples_to_truth(subbatch)
        self.model.initialize(X=doc_sample, Y=label_sample)

    @Language.factory("translator")
    def make_component(nlp, name, model):
        return TrainableComponent(nlp.vocab, model, name=name)

He also rewrote my config.cfg file into this:

[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
gpu_allocator = null
seed = 0

[nlp]
lang = "bok"
pipeline = ["tok2vec","tagger","morphologizer","lemmatizer","translator"]
batch_size = 1000
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.lemmatizer]
factory = "lemmatizer"
mode = "lookup"
model = null
overwrite = false
scorer = {"@scorers":"spacy.lemmatizer_scorer.v1"}

[components.morphologizer]
factory = "morphologizer"
extend = false
overwrite = true
scorer = {"@scorers":"spacy.morphologizer_scorer.v1"}

[components.morphologizer.model]
@architectures = "spacy.Tagger.v1"
nO = null

[components.morphologizer.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"

[components.tagger]
factory = "tagger"
neg_prefix = "!"
overwrite = false
scorer = {"@scorers":"spacy.tagger_scorer.v1"}

[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null

[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
attrs = ["NORM","PREFIX","SUFFIX","SHAPE"]
rows = [5000,2500,2500,2500]
include_static_vectors = true

[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 256
depth = 8
window_size = 1
maxout_pieces = 3

[components.translator]
factory = "translator"

[components.translator.model]
@architectures = "LatinBok.v1"

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
before_to_disk = null

[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
get_length = null

[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
t = 0.0

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
learn_rate = 0.001

[training.score_weights]
tag_acc = 0.33
pos_acc = 0.17
morph_acc = 0.17
morph_per_feat = null
lemma_acc = 0.33

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]

[initialize.components.lemmatizer]

[initialize.components.lemmatizer.lookups]
@misc = "bok_lookups_loader"
data_path = 'lemma'

[initialize.components.translator]

He told me to retrain the model using Boktrain.spacy and Bokdev.spacy (which were CoNLL-U files I used to train the original model) and a JSON file containing the lemmas. He told me to run this in my command line to train everything:

python -m spacy train config.cfg --output Bok --paths.train Boktrain.spacy --paths.dev Bokdev.spacy --code code.py

Which I did, but I got this error:

[2022-05-13 15:01:37,487] [INFO] Set up nlp object from config
[2022-05-13 15:01:37,753] [INFO] Pipeline: ['tok2vec', 'tagger', 'morphologizer', 'lemmatizer', 'translator']
[2022-05-13 15:01:45,619] [INFO] Created vocabulary
[2022-05-13 15:01:45,630] [INFO] Finished initializing nlp object
Traceback (most recent call last):
  File "C:\Users\User\anaconda3\envs\BokModel\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\User\anaconda3\envs\BokModel\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "C:\Users\User\anaconda3\envs\BokModel\lib\site-packages\spacy\__main__.py", line 4, in <module>
    setup_cli()
  File "C:\Users\User\anaconda3\envs\BokModel\lib\site-packages\spacy\cli\_util.py", line 71, in setup_cli
    command(prog_name=COMMAND)
  File "C:\Users\User\anaconda3\envs\BokModel\lib\site-packages\click\core.py", line 1130, in __call__
    return self.main(*args, **kwargs)
  File "C:\Users\User\anaconda3\envs\BokModel\lib\site-packages\click\core.py", line 1055, in main
    rv = self.invoke(ctx)
  File "C:\Users\User\anaconda3\envs\BokModel\lib\site-packages\click\core.py", line 1657, in invoke
    return _process_result(sub_ctx.command.invoke(sub_ctx))
  File "C:\Users\User\anaconda3\envs\BokModel\lib\site-packages\click\core.py", line 1404, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "C:\Users\User\anaconda3\envs\BokModel\lib\site-packages\click\core.py", line 760, in invoke
    return __callback(*args, **kwargs)
  File "C:\Users\User\anaconda3\envs\BokModel\lib\site-packages\typer\main.py", line 500, in wrapper
    return callback(**use_params)  # type: ignore
  File "C:\Users\User\anaconda3\envs\BokModel\lib\site-packages\spacy\cli\train.py", line 45, in train_cli
    train(config_path, output_path, use_gpu=use_gpu, overrides=overrides)
  File "C:\Users\User\anaconda3\envs\BokModel\lib\site-packages\spacy\cli\train.py", line 72, in train
    nlp = init_nlp(config, use_gpu=use_gpu)
  File "C:\Users\User\anaconda3\envs\BokModel\lib\site-packages\spacy\training\initialize.py", line 84, in init_nlp
    nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
  File "C:\Users\User\anaconda3\envs\BokModel\lib\site-packages\spacy\language.py", line 1309, in initialize
    proc.initialize(get_examples, nlp=self, **p_settings)
  File "C:\Users\User\Downloads\BokModel\LatinToArabic\test.py", line 65, in initialize
    relations = example.reference._.trans
  File "C:\Users\User\anaconda3\envs\BokModel\lib\site-packages\spacy\tokens\underscore.py", line 47, in __getattr__
    raise AttributeError(Errors.E046.format(name=name))
AttributeError: [E046] Can't retrieve unregistered extension attribute 'trans'. Did you forget to call the `set_extension` method?

I'm desperately trying just to get a spaCy Doc object with the translations in it, so that I can use it in my spaCy pipeline or through a custom spaCy extension. I have a basic understanding of spaCy/Python, but I'm not understanding anything here. I tried running each section of the code, and it seems that only the lemma part is correct; everything else seems to be wrong. If anyone has a solution to this, as simple as it may seem to you, I'm interested.



Solution 1:[1]

Your error just indicates that this particular field isn't registered. You can register it by putting this at the top of your code.py, after the imports (set_extension needs a default, getter, or method; a dict default matches how your set_annotations code indexes doc._.trans):

Doc.set_extension("trans", default={})

I don't know if you have other problems or not, but that will fix your current error.
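
As a quick check that the extension works once registered, something like this should run (a minimal sketch; spacy.blank("xx") is just a throwaway blank pipeline, not your bok model, and the stored value is arbitrary):

import spacy
from spacy.tokens import Doc

# register once; the guard avoids an error if the script re-runs and the extension already exists
if not Doc.has_extension("trans"):
    Doc.set_extension("trans", default={})

nlp = spacy.blank("xx")                    # throwaway blank pipeline, just for the check
doc = nlp("some text")
doc._.trans = {(0, 1): {"token": "translation"}}   # store whatever your component produces
print(doc._.trans)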

If you don't understand something, I would recommend searching the spaCy docs for related topics - there are a lot of documented examples.

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution 1: polm23