# Listing metadata carried over from the file viewer (not part of the program):
# file size 6,064 bytes, revision 3411193.
import html

import pandas as pd
import requests
import streamlit as st
import tokenizers
from transformers import pipeline
from transformers.tokenization_utils import TruncationStrategy
# Page-level configuration. As a bare statement, `page_title='AlephBERT Demo',`
# only bound a one-element tuple to an unused name; the title has to be passed
# to Streamlit for it to reach the browser tab.
# NOTE(review): any other page options the call originally carried are not
# visible in this copy of the file -- confirm against the deployed app.
st.set_page_config(
    page_title='AlephBERT Demo',
)
# st.markdown(
# """
# <style>
# .sidebar .sidebar-content {
# background-image: linear-gradient(#3377ff, #80aaff);
# }
# footer {
# color:white;
# visibility: hidden;
# }
# input {
# direction: rtl;
# }
# .stTextInput .instructions {
# color: grey;
# font-size: 9px;}
# </style>
# <div style="color:white; font-size:13px; font-family:monospace;position: fixed; z-index: 1; bottom: 0; right:0; background-color: #f63766;margin:3px;padding:8px;border-radius: 5px;"><a href="" target="_blank" style="text-decoration: none;color: white;">Use aleph-bert in your project </a></div>
# """,
# unsafe_allow_html=True,
# )
# Registry of the models offered in the sidebar, keyed by display name.
# Each entry carries:
#   name_or_path -- identifier handed to `transformers.pipeline` by load_model()
#   description  -- human-readable label
# NOTE(review): the `name_or_path` values were missing from this copy of the
# file even though load_model() reads them; the ids below are the public
# Hugging Face hub names for these models -- confirm before deploying.
models = {
    "AlephBERT-base": {
        "name_or_path": "onlplab/alephbert-base",
        "description": "AlephBERT base model",
    },
    "HeBERT-base-TAU": {
        "name_or_path": "avichr/heBERT",
        "description": "HeBERT model created by TAU",
    },
    "mBERT-base-multilingual-cased": {
        "name_or_path": "bert-base-multilingual-cased",
        "description": "Multilingual BERT model",
    },
}
def get_json_from_url(url):
    """Return the model registry for the demo.

    The remote lookup is disabled: ``url`` is accepted for interface
    compatibility but ignored, and the module-level ``models`` dict is
    returned as-is. The previous body had ``return requests.get(url).json()``
    after the first ``return``, where it could never execute; it has been
    removed rather than left as dead code.

    Args:
        url: Address of a JSON model registry (currently unused).

    Returns:
        The module-level ``models`` mapping.
    """
    return models
# models = get_json_from_url('')
@st.cache(show_spinner=False, hash_funcs={tokenizers.Tokenizer: str})
def load_model(model):
    """Build a fill-mask pipeline for *model* and a matching tokenize helper.

    The pipeline's ``_parse_and_tokenize`` hook is replaced so callers can
    hand it an already-tokenized batch (``tokenized=True``); this is what lets
    the page mask a single sub-word id in place before running the model.

    Args:
        model: Key into the module-level ``models`` registry.

    Returns:
        A ``(pipe, do_tokenize)`` tuple: the patched pipeline and a function
        that tokenizes raw text the same way the pipeline would.

    Raises:
        KeyError: If *model* is not in ``models`` or its entry has no
            ``name_or_path``.
    """
    pipe = pipeline('fill-mask', models[model]['name_or_path'])

    def do_tokenize(inputs):
        # NOTE(review): the argument list of this call was missing from this
        # copy of the file. It is reconstructed from how the result is used:
        # callers do `['input_ids'].tolist()[0]` (framework tensors with a
        # batch axis), strip the first/last id (special tokens present), and
        # the file imports TruncationStrategy for no other purpose. Confirm.
        return pipe.tokenizer(
            inputs,
            add_special_tokens=True,
            return_tensors=pipe.framework,
            truncation=TruncationStrategy.DO_NOT_TRUNCATE,
        )

    def _parse_and_tokenize(inputs, tokenized=False, **kwargs):
        # Pass pre-tokenized batches straight through; tokenize raw text.
        if not tokenized:
            inputs = do_tokenize(inputs)
        return inputs

    # NOTE(review): relies on the pipeline exposing a private
    # `_parse_and_tokenize` hook -- verify against the pinned transformers
    # version.
    pipe._parse_and_tokenize = _parse_and_tokenize

    return pipe, do_tokenize
# Branding block: logo image plus an "AlephBERT Demo • ONLP Lab" caption.
# The string literal was left unterminated and detached from any call, which
# swallowed the rest of the file into the string.
# NOTE(review): rendered in the sidebar on the assumption that it sits with
# the other sidebar controls; the empty href/src attributes are reproduced
# exactly as they appear in this copy of the file -- restore the real links.
st.sidebar.markdown(
    """<div><a target="_blank" href=""><img src="" style="filter: invert(100%);display: block;margin-left: auto;margin-right: auto;
width: 70%;"></a>
<p style="color:white; font-size:13px; font-family:monospace; text-align: center">AlephBERT Demo • <a href="" style="text-decoration: none;color: white;" target="_blank">ONLP Lab</a></p></div>""",
    unsafe_allow_html=True,
)
# The demo has a single mode today; the guard is kept so that further modes
# can be added alongside it.
mode = 'Models'

if mode == 'Models':
    # ---- Sidebar controls ------------------------------------------------
    model = st.sidebar.selectbox(
        'Select Model',
        list(models),
    )
    masking_level = st.sidebar.selectbox('Masking Level:', ['Tokens', 'SubWords'])
    # NOTE(review): the bounds/default of this widget were not visible in this
    # copy of the file; integer arguments are used so the widget yields an int
    # suitable for `top_k`. Confirm the intended range.
    n_res = st.sidebar.number_input(
        'Number Of Results',
        min_value=1,
        value=5,
        step=1,
    )

    # ---- Model badge row ---------------------------------------------------
    # "AlephBERT-base" -> ["Model:AlephBERT", "base"], one pill per part.
    model_tags = model.split('-')
    model_tags[0] = 'Model:' + model_tags[0]
    st.markdown(''.join([f'<span style="color:white; font-size:13px; font-family:monospace; background-color: #f63766;margin:3px;padding:8px;border-radius: 5px;">{tag}</span>' for tag in model_tags]),unsafe_allow_html=True)

    # prepare the model
    unmasker, tokenize = load_model(model)

    # get inputs
    input_text = st.text_input('Insert text you want to mask', '')
    if input_text:
        input_masked = None
        display_input = ''
        tokenized = tokenize(input_text)
        ids = tokenized['input_ids'].tolist()[0]
        subwords = unmasker.tokenizer.convert_ids_to_tokens(ids)

        if masking_level == 'Tokens':
            tokens = input_text.split()
            # Select by position, not by value: masking by value replaced
            # every occurrence of a repeated word and produced several mask
            # tokens in one input.
            position = st.selectbox(
                'Select token to mask:',
                [None] + list(range(len(tokens))),
                format_func=lambda i: '' if i is None else tokens[i],
            )
            if position is not None:
                input_masked = ' '.join(
                    unmasker.tokenizer.mask_token if i == position else token
                    for i, token in enumerate(tokens)
                )
                display_input = input_masked

        if masking_level == 'SubWords':
            tokens = subwords
            # Index 0 is the leading special token and doubles as the empty
            # "nothing selected" choice; the trailing special token is left
            # out of the range.
            idx = st.selectbox('Select token to mask:', list(range(0,len(tokens)-1)), format_func=lambda i: tokens[i] if i else '')
            if idx:
                # Only touch the ids once a real sub-word is chosen, so the
                # leading special token is never overwritten.
                tokenized['input_ids'][0][idx] = unmasker.tokenizer.mask_token_id
                ids = tokenized['input_ids'].tolist()[0]
                display_input = ' '.join(unmasker.tokenizer.convert_ids_to_tokens(ids[1:-1]))
                input_masked = tokenized

        if input_masked is not None:
            st.markdown('#### Input:')
            # The text originates from the user, so escape it before placing
            # it inside raw HTML.
            st.markdown(
                f'<p dir="rtl">{html.escape(display_input)}</p>',
                unsafe_allow_html=True,
            )
            st.markdown('#### Outputs:')
            res = unmasker(input_masked, tokenized=masking_level == 'SubWords', top_k=int(n_res))
            if res:
                res = [{'Prediction':r['token_str'], 'Completed Sentence':r['sequence'].replace('[SEP]', '').replace('[CLS]', ''), 'Score':r['score']} for r in res]
                res_table = pd.DataFrame(res)
                # The table was built but never rendered.
                st.table(res_table)
# cols = st.beta_columns(len(tokens))
# genre =
# 'Select token to mask:', tokens)
# for col, token in zip(cols, reversed(tokens)):
# col.text(token)
# st.text(tokens)
# res = unmasker(input_text)
# res_table = pd.DataFrame(res)
# st.table(res_table)
# st.text(res)