from pathlib import Path

import streamlit as st

from hindi_tokenizer import load_tokenizer, encode_text, decode_text
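
# NOTE: hindi_tokenizer is this project's local module. As used below,
# load_tokenizer(path) is expected to return the tokenizer object,
# encode_text(tokenizer, text) a (token_ids, tokens) pair, and
# decode_text(tokenizer, ids) the reconstructed string.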


def load_hindi_tokenizer():
    """Load the trained Hindi BPE tokenizer."""
    config_path = Path("output") / "hindi_encoder.json"

    if not config_path.exists():
        st.error("Tokenizer configuration not found. Please train the tokenizer first.")
        st.stop()

    return load_tokenizer(str(config_path))
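
# A possible improvement (not in the original): decorate load_hindi_tokenizer
# with @st.cache_resource so the tokenizer is loaded once per session rather
# than on every script rerun.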


def main():
    st.set_page_config(
        page_title="Hindi BPE Tokenizer",
        page_icon="🇮🇳",
        layout="wide",
        initial_sidebar_state="expanded",
    )
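
    # Inject custom CSS for the dark theme. unsafe_allow_html is required for
    # the raw <style> tag to be rendered rather than escaped.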
    st.markdown(
        """
        <style>
        .stApp {
            background-color: #2E2E2E;  /* Dark background */
            color: #FFFFFF;             /* White text for better contrast */
        }
        /* Streamlit renders the actual <button> inside a .stButton wrapper,
           so target the inner element. */
        .stButton > button {
            background-color: #4CAF50;  /* Green button */
            color: white;
            border-radius: 8px;         /* Rounded corners */
            padding: 10px 20px;
            font-size: 16px;
        }
        .stButton > button:hover {
            background-color: #45a049;  /* Darker green on hover */
        }
        /* Likewise, style the nested input/textarea elements directly. */
        .stTextInput input, .stTextArea textarea {
            background-color: #ffffff;  /* White input fields */
            color: #000000;             /* Black text so input stays readable */
            border: 2px solid #4CAF50;  /* Green border */
            border-radius: 8px;
            padding: 10px;
            font-size: 16px;
        }
        .stTextInput input:focus, .stTextArea textarea:focus {
            border-color: #45a049;                      /* Border color on focus */
            box-shadow: 0 0 5px rgba(76, 175, 80, 0.5); /* Glow on focus */
        }
        .stHeader {
            color: #FFD700;     /* Gold header text */
            font-size: 28px;
            font-weight: bold;
        }
        .stMarkdown {
            color: #FFFFFF;     /* White markdown text */
            font-size: 16px;
        }
        </style>
        """,
        unsafe_allow_html=True,
    )

    st.title("Hindi BPE Tokenizer")
    st.markdown("A web interface for encoding and decoding Hindi text using BPE tokenization.")

    try:
        tokenizer = load_hindi_tokenizer()
    except Exception as e:
        st.error(f"Error loading tokenizer: {e}")
        st.stop()
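
    # Two-column layout: encoding on the left, decoding on the right.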
    encode_col, decode_col = st.columns(2)

    with encode_col:
        st.header("Encode Hindi Text")
        st.markdown("Convert Hindi text into token IDs")

        input_text = st.text_area(
            "Enter Hindi Text",
            placeholder="यहाँ हिंदी टेक्स्ट लिखें...",  # "Write Hindi text here..."
            height=150,
            key="encode_input",
        )

        if st.button("Encode", key="encode_button"):
            if input_text.strip():
                try:
                    token_ids, tokens = encode_text(tokenizer, input_text)

                    st.subheader("Results:")
                    st.markdown("**Tokens:**")
                    st.write(tokens)

                    st.markdown("**Token IDs:**")
                    st.write(token_ids)

                    st.markdown("**Token IDs (comma-separated):**")
                    st.code(", ".join(map(str, token_ids)))
                except Exception as e:
                    st.error(f"Error during encoding: {e}")
            else:
                st.warning("Please enter some text to encode.")
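
    # The decode side accepts comma-separated integers; int() raises ValueError
    # on malformed entries, which is surfaced as a friendly error below.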
    with decode_col:
        st.header("Decode Token IDs")
        st.markdown("Convert token IDs back to Hindi text")

        input_ids = st.text_area(
            "Enter Token IDs (comma-separated)",
            placeholder="2197, 1024, 402, 7, 924...",
            height=150,
            key="decode_input",
        )

        if st.button("Decode", key="decode_button"):
            if input_ids.strip():
                try:
                    # Avoid shadowing the id() builtin; skip empty segments so
                    # a trailing comma does not raise.
                    token_ids = [
                        int(part.strip())
                        for part in input_ids.split(",")
                        if part.strip()
                    ]
                    decoded_text = decode_text(tokenizer, token_ids)

                    st.subheader("Results:")
                    st.markdown("**Decoded Text:**")
                    st.write(decoded_text)

                    st.text_area(
                        "Decoded Text (copyable)",
                        value=decoded_text,
                        height=100,
                        key="decoded_output",
                    )
                except ValueError:
                    st.error("Invalid input format. Please enter comma-separated numbers.")
                except Exception as e:
                    st.error(f"Error during decoding: {e}")
            else:
                st.warning("Please enter token IDs to decode.")
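
    # The figures below are hard-coded; keep them in sync with the actual
    # training configuration.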
    st.markdown("---")
    st.markdown("### About the Tokenizer")

    info_col1, info_col2 = st.columns(2)

    with info_col1:
        st.markdown("""
        **Tokenizer Details:**
        - Type: Byte Pair Encoding (BPE)
        - Vocabulary Size: 5000 tokens
        - Special Tokens: `<pad>`, `<unk>`, `<s>`, `</s>`
        - Minimum Token Frequency: 2
        """)

    with info_col2:
        st.markdown("""
        **Preprocessing:**
        - Retains Hindi Unicode (\\u0900-\\u097F)
        - Removes digits and special characters
        - Normalizes punctuation
        - Cleans whitespace
        """)


if __name__ == "__main__":
    main()
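
# Launch with: streamlit run app.py  (assuming this file is saved as app.py)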