import numpy as np
from sentence_transformers import SentenceTransformer, util
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re

nltk.download('stopwords')
nltk.download('punkt')

STOP_WORDS = list(stopwords.words('english'))
BERTOPIC_REPRESENTATIONS = [
    "KeyBERTInspired",
    "MaximalMarginalRelevance",
]

TRANSFORMERS = ["all-mpnet-base-v2", "multi-qa-mpnet-base-dot-v1"]

TRANSFORMERS_INFO = [
    "all-mpnet-base-v2: All-round model tuned for many use cases. "
    "Trained on a large and diverse dataset of over 1 billion training pairs.",
    "multi-qa-mpnet-base-dot-v1: This model was tuned for semantic search: given a query/question, "
    "it can find relevant passages. "
    "It was trained on a large and diverse set of (question, answer) pairs.",
]

def get_bertopic_representation(representation: str):
    """
    maps a representation name to the corresponding BERTopic representation model
    :param representation: one of BERTOPIC_REPRESENTATIONS
    :return: the representation model instance, or None if the name is unknown
    """
    if representation == BERTOPIC_REPRESENTATIONS[0]:
        return KeyBERTInspired()
    elif representation == BERTOPIC_REPRESENTATIONS[1]:
        return MaximalMarginalRelevance()
    else:
        return None
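
# A minimal usage sketch (kept in comments so the module stays import-safe) of
# plugging the selected representation into BERTopic; `docs` is a hypothetical
# list of input documents.
#
#   from bertopic import BERTopic
#
#   representation = get_bertopic_representation("KeyBERTInspired")
#   topic_model = BERTopic(representation_model=representation)
#   topics, probs = topic_model.fit_transform(docs)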

def tokenize_explode(df, col):
    """
    tokenizes the given column and explodes the dataframe to one token per row
    :param df:
    :param col: name of the text column to tokenize
    :return: exploded dataframe with a stripped, lowercased 'tokenized' column
    """
    df['tokenized'] = df[col].apply(word_tokenize)
    df = df.explode('tokenized')
    df['tokenized'] = df['tokenized'].str.strip().str.lower()
    return df

def cleanup_tokens(df, col):
    """
    drops short tokens, pure numbers, punctuation and stopwords
    :param df:
    :param col: name of the token column to clean
    :return: filtered dataframe
    """
    df = df[df[col].apply(lambda x: len(x) > 2)]
    # non-capturing group avoids the pandas "match groups" UserWarning
    df = df[~df[col].str.contains(r'^(?:\d+\.?\d*)$', regex=True)]
    df = df[~df[col].isin(list(string.punctuation))]
    df = df[~df[col].isin(STOP_WORDS)]
    return df
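
# A minimal sketch of the tokenize-then-clean flow, assuming pandas is
# available; the 'text' column name is hypothetical.
#
#   import pandas as pd
#
#   df = pd.DataFrame({'text': ["The quick brown fox, 42 times!"]})
#   df = tokenize_explode(df, 'text')
#   df = cleanup_tokens(df, 'tokenized')
#   # remaining tokens: 'quick', 'brown', 'fox', 'times'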

def get_embedding_model(transformer=TRANSFORMERS[0]) -> SentenceTransformer:
    """
    loads the given sentence transformer model
    :param transformer: model name, one of TRANSFORMERS
    :return: the loaded SentenceTransformer model
    """
    sentence_model = SentenceTransformer(transformer)
    return sentence_model

def str_to_vector_list(text_list, sentence_model, replace_dict=None):
    """
    embeds the given text list using the provided embedding model
    :param text_list:
    :param sentence_model:
    :param replace_dict: mapping of substrings to replace in each string before encoding
    :return: list of embedding vectors
    """
    # strip punctuation; str.replace treats the pattern literally, so use re.sub
    text_list = [re.sub(r'[^\w\s]', '', str(x)) for x in text_list]
    if replace_dict:
        for stp in replace_dict:
            text_list = [str(x).replace(stp, replace_dict[stp]) for x in text_list]
    embeddings = sentence_model.encode(text_list, show_progress_bar=True, batch_size=1000)
    return embeddings.tolist()

def remove_unnecessary_tokens_from_df(df, columns, extra_stopwords=None) -> None:
    """
    removes punctuation and extra stopwords from the given columns, modifying the dataframe in place
    :param df:
    :param columns:
    :param extra_stopwords: additional substrings to blank out
    :return:
    """
    # regex=True is required: recent pandas treats str.replace patterns literally by default
    df[columns] = df[columns].apply(lambda x: x.str.replace(r'[^\w\s]', '', regex=True))
    if extra_stopwords:
        for stp in extra_stopwords:
            df[columns] = df[columns].apply(lambda x: x.str.replace(stp, ' ', regex=False))
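
# A minimal sketch of the in-place cleanup, assuming pandas is available;
# the 'title' column and the 'DRAFT' stopword are hypothetical.
#
#   import pandas as pd
#
#   df = pd.DataFrame({'title': ["Hello, world! [DRAFT]"]})
#   remove_unnecessary_tokens_from_df(df, ['title'], extra_stopwords=['DRAFT'])
#   # punctuation is stripped and 'DRAFT' is blanked out in place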

def cosine_sim_matrix(embeddings_a, embeddings_b) -> np.ndarray:
    """
    computes the cosine similarity matrix for the given embeddings
    :param embeddings_a:
    :param embeddings_b:
    :return: numpy array holding the pairwise cosine similarities
    """
    return np.array(
        util.pytorch_cos_sim(embeddings_a, embeddings_b)
    )
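
# A minimal end-to-end sketch: embed two hypothetical string lists and compare
# them. The model is downloaded on first use.
#
#   model = get_embedding_model(TRANSFORMERS[0])
#   queries = str_to_vector_list(["how do i reset my password"], model)
#   passages = str_to_vector_list(["to reset your password open settings"], model)
#   sims = cosine_sim_matrix(queries, passages)
#   print(sims.shape)  # (1, 1); sims[0][0] is the cosine similarity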