import re
import unicodedata

import gradio as gr
from optimum.onnxruntime import ORTModelForTokenClassification
from transformers import pipeline, AutoTokenizer
# Rule-based keyword table: main category -> sub category -> keywords.
# Keywords are written lowercase and without diacritics, so they are meant to
# be compared against text that has been lower-cased and accent-folded.
# Insertion order is significant to lookups that scan the table in order.
CATEGORIES = {
    "Need": {
        "Housing": ["nha", "thue", "sua nha"],
        "Groceries": ["thuc pham", "rau cu", "sieu thi"],
    },
    "Want": {
        "Entertainment": ["phim", "karaoke", "game", "do choi"],
        "Dining Out": ["cafe", "nha hang", "tra sua"],
    },
    "Saving/Investment": {
        "Savings": ["quy tiet kiem", "dau tu", "tai san"],
    },
}
# Translation table applied to NFD-decomposed text: deletes the combining
# marks used by Vietnamese orthography (grave, acute, circumflex, tilde,
# breve, hook above, horn, dot below) and maps the stroke-d letters, which
# have no canonical decomposition, to plain "d"/"D".
_VIETNAMESE_FOLD_TABLE = str.maketrans(
    "đĐ",
    "dD",
    "\u0300\u0301\u0302\u0303\u0306\u0309\u031b\u0323",
)


def normalize_vietnamese(text):
    """Return *text* with Vietnamese diacritics folded to their base letters.

    Each accented letter is replaced by its unaccented base letter
    (e.g. "nhà" -> "nha", "trà sữa" -> "tra sua") and "đ"/"Đ" become
    "d"/"D". Letter case, digits, whitespace and characters from other
    scripts are left as they are.

    Args:
        text: The string to fold. Must be a ``str``.

    Returns:
        The folded string, in NFC form.

    Raises:
        TypeError: If *text* is not a string.
    """
    # Decompose so every accented letter becomes base letter + combining
    # marks; the marks can then be dropped without losing the base letter.
    decomposed = unicodedata.normalize("NFD", text)
    folded = decomposed.translate(_VIETNAMESE_FOLD_TABLE)
    # Recompose so text outside the Vietnamese mark set is not left in
    # decomposed form.
    return unicodedata.normalize("NFC", folded)
# Checkpoint name shared by the tokenizer and the ONNX Runtime model.
# NOTE(review): this looks like a base checkpoint rather than one fine-tuned
# for token classification, and nothing here performs quantization despite
# the `quantized_model` name -- confirm both before relying on NER output.
model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
quantized_model = ORTModelForTokenClassification.from_pretrained(model_name)

# Token-classification pipeline used as the fallback when no keyword matches.
# Per the transformers docs, the "simple" strategy groups adjacent tokens into
# entities instead of returning one result per sub-word token.
ner_model = pipeline(
    task="ner",
    model=quantized_model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)
def classify_and_extract(user_input):
    """Classify a free-text spending note and extract its amount.

    The input is lower-cased and passed through ``normalize_vietnamese``,
    then searched for the keywords in ``CATEGORIES``. When several keywords
    occur in the text, the longest one wins, so a specific phrase such as
    "nha hang" is not shadowed by a shorter keyword it contains ("nha").
    Keywords of equal length are resolved in table order. If no keyword
    occurs, the raw input is handed to ``ner_model`` instead.

    Args:
        user_input: The text typed by the user. ``None`` is treated as an
            empty string.

    Returns:
        A dict with the keys:
            "Main Category": matched main category, or "Uncategorized".
            "Sub Category": matched sub category, or "Unknown".
            "Amount": the first number in the text as a string, with any
                "."/"," digit groups kept (e.g. "50.000"), or "Unknown".
            "NER Entities": ``[]`` on a keyword match or blank input,
                otherwise whatever ``ner_model`` returns for the raw input.
    """
    text = user_input or ""
    normalized_input = normalize_vietnamese(text.lower())

    # Keep separator-joined digit groups together ("50.000", "1,5") rather
    # than cutting the number off at the first separator.
    amount_match = re.search(r"\d+(?:[.,]\d+)*", normalized_input)
    amount = amount_match.group(0) if amount_match else "Unknown"

    # Track the longest keyword seen so far; the strict ">" keeps the earlier
    # table entry when two matching keywords have the same length.
    best_length = 0
    best_categories = None
    for main_cat, subcategories in CATEGORIES.items():
        for sub_cat, keywords in subcategories.items():
            for keyword in keywords:
                if len(keyword) > best_length and keyword in normalized_input:
                    best_length = len(keyword)
                    best_categories = (main_cat, sub_cat)

    if best_categories is not None:
        main_cat, sub_cat = best_categories
        return {
            "Main Category": main_cat,
            "Sub Category": sub_cat,
            "Amount": amount,
            "NER Entities": [],
        }

    # No keyword matched: fall back to the NER pipeline on the original
    # (un-normalized) text. Blank input has nothing to tag, so skip the call.
    ner_results = ner_model(user_input) if text.strip() else []
    return {
        "Main Category": "Uncategorized",
        "Sub Category": "Unknown",
        "Amount": amount,
        "NER Entities": ner_results,
    }
def process_user_input(user_input):
    """Render the classification of *user_input* as a four-line text report.

    Args:
        user_input: The text to classify; forwarded unchanged to
            ``classify_and_extract``.

    Returns:
        A string with one "<label>: <value>" line each for the main
        category, sub category, amount and NER entities, separated by
        newlines and without a trailing newline.
    """
    classification = classify_and_extract(user_input)
    # (label shown to the user, key in the classification dict)
    report_fields = (
        ("Main Category", "Main Category"),
        ("Sub Category", "Sub Category"),
        ("Amount", "Amount"),
        ("Entities", "NER Entities"),
    )
    return "\n".join(
        f"{label}: {classification[key]}" for label, key in report_fields
    )
# Gradio UI: a single text box in, the formatted classification report out.
iface = gr.Interface(
    fn=process_user_input,
    inputs="text",
    outputs="text",
    title="Expenditure Classifier",
    description="Classify and categorize spending.",
)

# Start the web server as soon as this module is executed.
# Per the Gradio docs, share=True also requests a temporary public link.
iface.launch(share=True)