# Spend / app.py
# Meomap's picture
# Update app.py
# 9839b62 verified
from transformers import pipeline, AutoTokenizer
from optimum.onnxruntime import ORTModelForTokenClassification
import re
import gradio as gr
# Spending taxonomy used by the rule-based classifier.
#
# Layout: main category -> sub category -> list of trigger keywords.
# Keywords are lowercase Vietnamese written WITHOUT diacritics, so they are
# meant to be compared against text that has been lowercased and had its
# accents folded away.
CATEGORIES = {
    # Essential spending.
    "Need": {
        "Housing": ["nha", "thue", "sua nha"],
        "Groceries": ["thuc pham", "rau cu", "sieu thi"],
    },
    # Discretionary spending.
    "Want": {
        "Entertainment": ["phim", "karaoke", "game", "do choi"],
        "Dining Out": ["cafe", "nha hang", "tra sua"],
    },
    # Money set aside rather than spent.
    "Saving/Investment": {
        "Savings": ["quy tiet kiem", "dau tu", "tai san"],
    },
}
# Normalize Vietnamese text
def normalize_vietnamese(text):
    """Return *text* with Vietnamese diacritics folded onto their base letters.

    Accented letters keep their base character ("nhà" -> "nha",
    "sữa" -> "sua") instead of being deleted, so the result can be compared
    against unaccented keywords. The letter "đ"/"Đ" has no Unicode
    decomposition and is mapped to "d"/"D" explicitly.

    Args:
        text: Input string; its case is preserved.

    Returns:
        The string with combining marks (tone marks, circumflex, breve,
        horn, ...) removed and "đ"/"Đ" replaced by "d"/"D".
    """
    # Function-scope import so this block stays self-contained.
    import unicodedata

    # NFD splits each accented letter into base letter + combining marks;
    # dropping the marks leaves only the base letter.
    decomposed = unicodedata.normalize("NFD", text)
    without_marks = "".join(
        ch for ch in decomposed if not unicodedata.combining(ch)
    )
    # Recompose so characters that were decomposed but carried no combining
    # marks are restored to their original form.
    recomposed = unicodedata.normalize("NFC", without_marks)
    return recomposed.replace("đ", "d").replace("Đ", "D")
# Model setup: load the ONNX Runtime token-classification model and its
# tokenizer from the same checkpoint, then wrap both in a NER pipeline.
#
# NOTE(review): despite the name `quantized_model`, no quantization step is
# visible in this file -- confirm whether quantization was intended.
# NOTE(review): this checkpoint name looks like a base multilingual model
# rather than one fine-tuned for NER; if so the predicted entity labels may
# not be meaningful -- confirm.
# NOTE(review): depending on the installed optimum version, loading a
# checkpoint that ships no ONNX weights may require `export=True` -- confirm.
model_name = "distilbert-base-multilingual-cased"
quantized_model = ORTModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# "simple" aggregation merges word-piece predictions into entity groups.
ner_model = pipeline(
    task="ner",
    tokenizer=tokenizer,
    model=quantized_model,
    aggregation_strategy="simple",
)
# Classify input
def _extract_amount(text):
    """Return the first numeric amount in *text* as a string, or "Unknown".

    Digit groups joined by "." or "," (e.g. "50.000", "1,5") are kept
    together instead of being cut at the first separator.
    """
    found = re.search(r"\d+(?:[.,]\d+)*", text)
    return found.group(0) if found else "Unknown"


def _match_category(text):
    """Return (main category, sub category) for the best keyword hit, or None.

    A keyword only counts when it appears as whole word(s), so "nha" does not
    fire inside "nhac". When several keywords match, the longest one wins, so
    "nha hang" (Dining Out) beats "nha" (Housing); ties keep CATEGORIES order.
    """
    best_hit = None
    best_len = 0
    for main_cat, subcategories in CATEGORIES.items():
        for sub_cat, keywords in subcategories.items():
            for keyword in keywords:
                if len(keyword) <= best_len:
                    continue  # cannot beat the current best match
                pattern = rf"(?<!\w){re.escape(keyword)}(?!\w)"
                if re.search(pattern, text):
                    best_hit = (main_cat, sub_cat)
                    best_len = len(keyword)
    return best_hit


def classify_and_extract(user_input):
    """Classify a free-text spending note and pull out its amount.

    The input is lowercased and passed through ``normalize_vietnamese``, then
    matched against the ``CATEGORIES`` keywords. Only when no keyword matches
    is the original text sent to ``ner_model``.

    Args:
        user_input: Text describing an expense. ``None``, empty and
            whitespace-only input yield the uncategorized result without
            calling the NER model.

    Returns:
        A dict with the keys "Main Category", "Sub Category", "Amount"
        (a string, "Unknown" when no number is present) and "NER Entities"
        (the NER pipeline output on the fallback path, otherwise ``[]``).
    """
    # Nothing to classify: skip normalization and the model call.
    if not user_input or not user_input.strip():
        return {
            "Main Category": "Uncategorized",
            "Sub Category": "Unknown",
            "Amount": "Unknown",
            "NER Entities": [],
        }

    normalized_input = normalize_vietnamese(user_input.lower())
    amount = _extract_amount(normalized_input)

    # Rule-based matching
    hit = _match_category(normalized_input)
    if hit is not None:
        main_cat, sub_cat = hit
        return {
            "Main Category": main_cat,
            "Sub Category": sub_cat,
            "Amount": amount,
            "NER Entities": [],
        }

    # Fallback to NER model; it receives the original, un-normalized text.
    ner_results = ner_model(user_input)
    return {
        "Main Category": "Uncategorized",
        "Sub Category": "Unknown",
        "Amount": amount,
        "NER Entities": ner_results,
    }
# Gradio callback
def process_user_input(user_input):
    """Format the classification of *user_input* as a four-line text report.

    Delegates to ``classify_and_extract`` and renders its "Main Category",
    "Sub Category", "Amount" and "NER Entities" fields, one per line.
    """
    fields = classify_and_extract(user_input)
    report_lines = [
        f"Main Category: {fields['Main Category']}",
        f"Sub Category: {fields['Sub Category']}",
        f"Amount: {fields['Amount']}",
        f"Entities: {fields['NER Entities']}",
    ]
    return "\n".join(report_lines)
# Text-in / text-out web UI around process_user_input.
iface = gr.Interface(
    process_user_input,
    "text",
    "text",
    title="Expenditure Classifier",
    description="Classify and categorize spending.",
)

# NOTE(review): share=True requests a public share link; confirm it is wanted
# in the environment where this app is hosted.
iface.launch(share=True)