How do online translators work?
Introduction:
Transformers were designed to handle sequential data, such as natural language, for tasks like translation and summarization. Unlike recurrent models, a transformer does not require the sequence to be processed in order: if the input is a natural-language sentence, the model does not need to read the beginning before the end. This allows far more parallelization and therefore shorter training times.
How does the transformer model work?
The transformer is a deep learning model built around the attention mechanism: for every position in a sequence it computes how strongly that position relates to every other position, and uses those weights to build a context-aware representation of each token. Because every position attends to the whole sequence at once, the model captures context without stepping through the sequence one token at a time.
Transformers can be used for a variety of natural-language tasks, such as translation, summarization, and text classification.
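At the heart of the transformer is scaled dot-product self-attention: every position is compared with every other position in a single matrix product, which is exactly what makes the parallel processing described above possible. A minimal NumPy sketch, illustrative only; the real model adds learned query/key/value projections, multiple heads, and positional encodings:
import numpy as np

def scaled_dot_product_attention(Q, K, V):
    """softmax(Q K^T / sqrt(d_k)) V, computed for all positions at once."""
    d_k = Q.shape[-1]
    scores = Q @ K.T / np.sqrt(d_k)                           # similarity of every position with every other
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights = weights / weights.sum(axis=-1, keepdims=True)   # softmax over each row
    return weights @ V                                        # each output is a weighted mix of all values

# Toy example: a "sentence" of 4 tokens with 8-dimensional embeddings
x = np.random.rand(4, 8)
print(scaled_dot_product_attention(x, x, x).shape)            # (4, 8)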
Advantages and disadvantages of its use
Advantages:
It is a relatively easy model to understand and use.
It parallelizes well, so it is relatively computationally efficient to train.
It is a versatile model that can be used for a variety of natural-language tasks.
Disadvantages:
It can be sensitive to the quality and amount of training data.
Its predictions can be difficult to interpret.
Example in code:
The dependencies
import os
import numpy as np
import pandas as pd
import re
import time
from joblib import dump, load
try:
    # Colab-only magic to select TensorFlow 2.x; ignored outside Colab
    %tensorflow_version 2.x
except:
    pass
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds
from mlearner.nlp import Transformer
from mlearner.nlp import Processor_data
Data Preprocessing
Files
with open("data/idioma-v7.es-en.en",
mode = "r", encoding = "utf-8") as f:
europarl_en = f.read()
with open("data/idioma-v7.es-en.es",
mode = "r", encoding = "utf-8") as f:
europarl_es = f.read()
with open("data/P85-Non-Breaking-Prefix.en",
mode = "r", encoding = "utf-8") as f:
non_breaking_prefix_en = f.read()
with open("data/P85-Non-Breaking-Prefix.en",
mode = "r", encoding = "utf-8") as f:
non_breaking_prefix_es = f.read()
(Preview of the English and Spanish corpora omitted.)
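The files contain one sentence per line; a quick way to preview the loaded corpora (a small illustrative check) is to print the first few lines of each:
# Peek at the first sentences of each corpus
print(europarl_en.split("\n")[:3])
print(europarl_es.split("\n")[:3])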
Cleaning the data:
def Function_clean(text):
    # Remove "@" mentions
    text = re.sub(r"@[A-Za-z0-9]+", ' ', text)
    # Remove URLs
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    return text
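Applied to a sample string (a hypothetical example), the function replaces mentions and URLs with spaces:
# Illustrative check of the cleaning function
sample = "@user check this out https://example.com/page now"
print(Function_clean(sample))  # the mention and the URL are replaced by spaces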
Processed Data for each Language
processor_en = Processor_data(target_vocab_size=2**13,
                              language="en",
                              function=Function_clean,
                              name="processor_en",
                              )
processor_es = Processor_data(target_vocab_size=2**13,
                              language="es",
                              function=Function_clean,
                              name="processor_es",
                              )
if not os.path.isfile(""):
    corpus_en = europarl_en
    # Clean the raw English corpus with the processor
    corpus_en = processor_en.clean(corpus_en)
    pd.DataFrame(corpus_en).to_csv("", index=False)
if not os.path.isfile(""):
    corpus_es = europarl_es
    # Clean the raw Spanish corpus with the processor
    corpus_es = processor_es.clean(corpus_es)
    pd.DataFrame(corpus_es).to_csv("", index=False)
corpus_en = pd.read_csv("")
corpus_es = pd.read_csv("")
Text Fields For Each Language
English dataframe (table output omitted)
Spanish dataframe (table output omitted)
Tokenizing the corpora
if not os.path.isfile(''):
    token_en = processor_en.process_text(corpus_en,
                                         isclean=True,
                                         padding=False)
    dump(processor_en, '')
else:
    processor_en = load('')
if not os.path.isfile(''):
    token_es = processor_es.process_text(corpus_es,
                                         isclean=True,
                                         padding=False)
    dump(processor_es, '')
else:
    processor_es = load('')
Vocabulary size
if not os.path.isfile("") and not os.path.isfile(""):
    VOCAB_SIZE_EN = processor_en.tokenizer.vocab_size + 2  # +2 for the start and end tokens
    VOCAB_SIZE_ES = processor_es.tokenizer.vocab_size + 2
    print(VOCAB_SIZE_EN, VOCAB_SIZE_ES)
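Processor_data builds a subword vocabulary for each language, and the two extra entries reserve ids for the start and end tokens used below. As a rough idea of how such a subword tokenizer behaves, here is a sketch using the SubwordTextEncoder bundled with tensorflow_datasets; this is an assumption about what the processor wraps, not the mlearner internals:
# Illustrative only: build a tiny subword vocabulary and round-trip a sentence.
# In older tfds versions the class lives under tfds.features.text instead.
sample_corpus = ["the European Parliament", "resumption of the session"]
tok = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (s for s in sample_corpus), target_vocab_size=2**8)
ids = tok.encode("the European Parliament")
print(ids)                # list of subword ids
print(tok.decode(ids))    # -> "the European Parliament"
print(tok.vocab_size)     # two extra ids are reserved above for <start>/<end>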
Filling empty fields
if not os.path.isfile("") and not os.path.isfile(""):
    corpus_es = corpus_es.fillna(" ")
    corpus_en = corpus_en.fillna(" ")
Preparing the input and output sequences
if not os.path.isfile("") and not os.path.isfile(""):
    # Frame each tokenized sentence with a start token and an end token
    inputs = [[VOCAB_SIZE_EN-2] +
              processor_en.tokenizer.encode(sentence[0]) + [VOCAB_SIZE_EN-1]
              for sentence in corpus_en.values]
    outputs = [[VOCAB_SIZE_ES-2] +
               processor_es.tokenizer.encode(sentence[0]) + [VOCAB_SIZE_ES-1]
               for sentence in corpus_es.values]
len(inputs), len(outputs)
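Each sentence is now framed by a start token (vocab_size - 2) and an end token (vocab_size - 1). A tiny illustrative check of what one encoded pair looks like, assuming the lists were just built above:
# First encoded pair: start id, subword ids, end id
print(inputs[0])    # e.g. [8196, ..., 8197] for English
print(outputs[0])   # e.g. [8223, ..., 8224] for Spanish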
MAX_LENGTH = 20
if not os.path.isfile("") and not os.path.isfile(""):
    # Drop sentence pairs whose English side is longer than MAX_LENGTH
    idx_to_remove = [count for count, sent in enumerate(inputs)
                     if len(sent) > MAX_LENGTH]
    if len(idx_to_remove) > 0:
        # Delete from the end so the earlier indices stay valid
        for idx in reversed(idx_to_remove):
            del inputs[idx]
            del outputs[idx]
    # Same filtering on the Spanish side
    idx_to_remove = [count for count, sent in enumerate(outputs)
                     if len(sent) > MAX_LENGTH]
    if len(idx_to_remove) > 0:
        for idx in reversed(idx_to_remove):
            del inputs[idx]
            del outputs[idx]
    pd.DataFrame(inputs).to_csv("", index=False)
    pd.DataFrame(outputs).to_csv("", index=False)
inputs and outputs
Inputs
inputs = pd.read_csv("").fillna(0).astype(int)
outputs = pd.read_csv("").fillna(0).astype(int)
len(inputs), len(outputs)
Outputs
MAX_LENGTH = 20
VOCAB_SIZE_EN = 8198
VOCAB_SIZE_ES = 8225
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs.values,
value=0,
padding='post',
maxlen=MAX_LENGTH)
outputs = tf.keras.preprocessing.sequence.pad_sequences(outputs.values,
value=0,
padding='post',
maxlen=MAX_LENGTH)
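After padding, both arrays are rectangular with shape (number of sentences, MAX_LENGTH), which is what the tf.data pipeline below expects; a quick check:
# Both arrays now share the fixed length MAX_LENGTH
print(inputs.shape, outputs.shape)   # (num_sentences, 20) (num_sentences, 20)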
Dataset input/outputs
BATCH_SIZE = 64
BUFFER_SIZE = 20000
dataset = tf.data.Dataset.from_tensor_slices((inputs, outputs))
dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
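A quick way to verify the pipeline (illustrative check) is to pull a single batch and confirm its shape:
# One batch of encoder/decoder sequences: (BATCH_SIZE, MAX_LENGTH) each
for enc_batch, dec_batch in dataset.take(1):
    print(enc_batch.shape, dec_batch.shape)   # (64, 20) (64, 20)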
The Transformer model
tf.keras.backend.clear_session()
D_MODEL = 100
NB_LAYERS = 4
FFN_UNITS = 256
NB_PROJ = 8 # 8
DROPOUT_RATE = 0.1 # 0.1
model_Transformer = Transformer(vocab_size_enc=VOCAB_SIZE_EN,
vocab_size_dec=VOCAB_SIZE_ES,
d_model=D_MODEL,
nb_layers=NB_LAYERS,
FFN_units=FFN_UNITS,
nb_proj=NB_PROJ,
dropout_rate=DROPOUT_RATE)
Training Loop
from mlearner.nlp import Transformer_train
TRAIN = True   # flag consumed by Transformer_train: run the training loop
EPOCHS = 5
Transformer_train(model_Transformer,
                  dataset,
                  d_model=D_MODEL,
                  train=TRAIN,
                  epochs=EPOCHS,
                  checkpoint_path="ckpt/",
                  max_to_keep=5)
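Transformer_train wraps the whole loop: optimizer, checkpointing under checkpoint_path, and loss/accuracy reporting per epoch. As a rough idea of what such a loop must handle, the loss has to ignore padded positions; a minimal sketch of that masked loss, an assumption about typical transformer training rather than the mlearner internals:
# Sketch of a padding-masked loss, as commonly used when training transformers
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction="none")

def masked_loss(real, pred):
    mask = tf.cast(tf.not_equal(real, 0), tf.float32)   # 1 for real tokens, 0 for padding
    per_token = loss_object(real, pred) * mask           # zero out padded positions
    return tf.reduce_sum(per_token) / tf.reduce_sum(mask)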
Evaluation
def evaluate(inp_sentence):
    # Frame the input sentence with the English start and end tokens
    inp_sentence = \
        [VOCAB_SIZE_EN-2] + processor_en.tokenizer.encode(inp_sentence) + [VOCAB_SIZE_EN-1]
    enc_input = tf.expand_dims(inp_sentence, axis=0)
    # Start decoding from the Spanish start token
    output = tf.expand_dims([VOCAB_SIZE_ES-2], axis=0)
    for _ in range(MAX_LENGTH):
        predictions = model_Transformer(enc_input, output, False)
        prediction = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(prediction, axis=-1), tf.int32)
        # Stop as soon as the end token is generated
        if predicted_id == VOCAB_SIZE_ES-1:
            return tf.squeeze(output, axis=0)
        output = tf.concat([output, predicted_id], axis=-1)
    return tf.squeeze(output, axis=0)
Translation function
def translate(sentence):
    output = evaluate(sentence).numpy()
    # Decode everything except the start/end tokens
    predicted_sentence = processor_es.tokenizer.decode(
        [i for i in output if i < VOCAB_SIZE_ES-2]
    )
    print("Input: {}".format(sentence))
    print("Predicted translation: {}".format(predicted_sentence))
Conclusion:
During training the accuracy stays at around 30% after 7 training epochs, so the predictions cannot be expected to be completely accurate. The important thing here is to analyze whether the model has learned from context: even when it gets a word wrong, we can see that it infers a continuation of the sentence that is reasonably consistent with the overall meaning of the phrase.