# erav3-s11-hindi-tokenizer / use_tokenizer.py
# MilindChawre: "Adding hindi BPE tokenizer" (commit da971a5)
from pathlib import Path
from hindi_tokenizer import load_tokenizer, encode_text, decode_text
def main():
    """Run an interactive encode/decode session with the trained tokenizer.

    The tokenizer configuration is read from ``output/hindi_encoder.json``
    (relative to the current working directory). If that file does not
    exist, an error message is printed and the function returns without
    starting the session.

    Otherwise the user is prompted in a loop. Every non-blank line is passed
    to ``encode_text``; the resulting tokens and token IDs are printed, then
    the IDs are passed to ``decode_text`` and the decoded text is printed.

    The loop ends when the user enters ``quit`` (case-insensitive,
    surrounding whitespace ignored) or when input ends (EOF or Ctrl-C).

    Returns:
        None.
    """
    # Load the trained tokenizer
    config_path = Path("output") / "hindi_encoder.json"
    if not config_path.exists():
        print("Error: Tokenizer configuration not found! Please train the tokenizer first.")
        return

    tokenizer = load_tokenizer(str(config_path))

    # Interactive loop
    print("Hindi Text Encoder/Decoder (type 'quit' to exit)")
    print("-" * 50)

    while True:
        try:
            text = input("\nEnter Hindi text to encode/decode: ")
        except (EOFError, KeyboardInterrupt):
            # stdin was closed (piped input, Ctrl-D) or the user pressed
            # Ctrl-C: end the session cleanly instead of with a traceback.
            print()
            break

        # Stripped copy is used only to recognise commands/blank lines; the
        # text handed to the tokenizer is left exactly as typed.
        command = text.strip()
        if not command:
            continue
        if command.lower() == "quit":
            break

        # Encode the text
        token_ids, tokens = encode_text(tokenizer, text)
        print("\nEncoding:")
        print(f"Tokens: {tokens}")
        print(f"Token IDs: {token_ids}")

        # Decode back
        decoded_text = decode_text(tokenizer, token_ids)
        print("\nDecoding:")
        print(f"Text: {decoded_text}")
# Start the interactive session only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()