Convert a pandas DataFrame column whose values are vectors into tensors
My question is how to convert a column of vectors in a pandas DataFrame into tensors. The DataFrame has a Resume column that holds a vector representation of each resume document, and I need to convert this column of the dataset into tensors. The code is below.
Each entry of the Resume column is a list of numbers (that is, a vector), and the Category column of the DataFrame holds scalar values. I tried to convert the column into a tensor this way:
# Attempt 1: pass the one-column DataFrame selection straight to tf.convert_to_tensor.
tf.convert_to_tensor(output[["Resume"]])
Other approaches I have tried are:
# Attempt 2: build a tf.data.Dataset from a dict of the "Resume" selection paired with the "Category" selection.
numeric_dict_ds = tf.data.Dataset.from_tensor_slices((dict(output[["Resume"]]), output[["Category"]]))
And the last approach was
# Attempt 3: build a tf.data.Dataset from a (features, target) pair.
# NOTE(review): numeric_features and target are not defined in the code shown here — confirm how they were derived.
numeric_dataset = tf.data.Dataset.from_tensor_slices((numeric_features, target))
But none of them works.
import os
import re
import shutil
import string
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
# Silence Python warnings and restrict TensorFlow's logger to errors.
warnings.filterwarnings('ignore')
tf.get_logger().setLevel('ERROR')

# Fetch NLTK's stop-word corpus and build the lookup set: the English stop
# words plus the two quote tokens `` and ''.
# NOTE(review): `nltk` and `stopwords` are not imported in the visible import
# lines — confirm they are imported (e.g. `from nltk.corpus import stopwords`).
nltk.download('stopwords')
stopwords_set = {*stopwords.words('english'), '``', "''"}
def clean_resume_text(resume):
    """Normalise one resume string before it is embedded.

    The text is lower-cased, then URLs, digits, the stand-alone tokens
    ``rt``/``cc``, hashtags, mentions, punctuation, remaining non-word
    characters and non-ASCII characters are removed or replaced by spaces.
    Finally every run of whitespace is collapsed to a single space.

    Args:
        resume: Raw resume text.

    Returns:
        The cleaned text. Leading/trailing single spaces are not stripped.
    """
    resume = resume.lower()
    resume = re.sub(r'http\S+\s*', ' ', resume)  # URLs
    resume = ''.join(ch for ch in resume if not ch.isdigit())  # digits
    # The text is already lower-case, so match "rt"/"cc" in lower case, and
    # only as whole words: an unanchored "cc" would be cut out of ordinary
    # words such as "account" or "success".
    resume = re.sub(r'\b(?:rt|cc)\b', ' ', resume)
    resume = re.sub(r'#\S+', '', resume)  # hashtags
    resume = re.sub(r'@\S+', ' ', resume)  # mentions
    # Delete punctuation in a single pass.
    resume = resume.translate(str.maketrans('', '', string.punctuation))
    resume = re.sub(r'\W', ' ', resume)  # any remaining non-word character
    resume = re.sub(r'[^\x00-\x7f]', ' ', resume)  # non-ASCII characters
    resume = re.sub(r'\s+', ' ', resume)  # collapse whitespace runs
    return resume
# Run the cleaner over every entry of the Resume column, in place.
resume_df['Resume'] = resume_df['Resume'].apply(clean_resume_text)

# TF-Hub layers: the uncased-English BERT preprocessing model and the
# matching encoder, the latter loaded with trainable=True.
bert_preprocess = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
)
bert_encoder = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",
    trainable=True,
)
def get_sentence_embeding(sentences):
    """Return the encoder's ``pooled_output`` for ``sentences``.

    ``sentences`` is passed unchanged to ``bert_preprocess``; the result is
    fed to ``bert_encoder`` and the ``'pooled_output'`` entry is returned.
    """
    encoder_outputs = bert_encoder(bert_preprocess(sentences))
    return encoder_outputs['pooled_output']
# Replace each resume text with its embedding; every resume is wrapped in a
# one-element list and embedded with its own call.
resume_df["Resume"] = resume_df["Resume"].apply(
    lambda resume_text: get_sentence_embeding([resume_text])
)

# Save the vectorized DataFrame.
resume_df.to_pickle("resume_Embedding.pkl")
# Reload the DataFrame of embeddings written by the previous step.
# NOTE(review): read_pickle unpickles arbitrary objects — only load files that
# this script produced itself.
output = pd.read_pickle("resume_Embedding.pkl")

# Integer-encode the category labels.
encoder = LabelEncoder()
output["Category"] = encoder.fit_transform(output["Category"])

# The "Resume" column holds one embedding object per row, so the DataFrame
# cannot be handed to tf.convert_to_tensor as a whole. Flatten each row's
# embedding to 1-D and stack the rows into one (num_rows, embedding_dim)
# float tensor; keep the labels as a separate integer tensor.
features = tf.stack([
    tf.reshape(tf.convert_to_tensor(embedding, dtype=tf.float32), [-1])
    for embedding in output["Resume"]
])
labels = tf.convert_to_tensor(output["Category"].to_numpy(), dtype=tf.int32)

embedding_dim = features.shape[-1]
num_classes = len(encoder.classes_)

# Input shape is the per-sample shape (no batch dimension). Hidden widths are
# fixed hyper-parameters rather than the number of rows, and the output layer
# has one unit per encoded category.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(embedding_dim,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax'),
])

# Integer class labels with a softmax output call for sparse categorical
# cross-entropy, not mean squared error. Train with model.fit(features, labels).
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
Solution 1:[1]
import tensorflow as tf
# A nested list of equal-length rows converts directly to a 2-D float tensor.
resume = [
    [1.0, 2.0],
    [3.0, 4.0],
    [5.0, 6.0],
]
tf.convert_to_tensor(resume, dtype=tf.float32)
Output
<tf.Tensor: shape=(3, 2), dtype=float32, numpy=
array([[1., 2.],
[3., 4.],
[5., 6.]], dtype=float32)>
Take a look at this link
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | TFer |