# wikipedia-en / app.py
# charlesdedampierre's picture
# display a sample
# 84952b3
import streamlit as st
import pandas as pd
# Sidebar: logo and tagline, then pointers to the package repository and the
# source dataset. Everything inside the `with` is rendered in the sidebar.
with st.sidebar:
    st.image("images/logo.png", use_column_width=True)
    st.write("Bunka Summarizes & Visualizes Information as Maps using LLMs.")

    st.title("Github Page")
    st.write(
        "Have a look at the following package on GitHub: https://github.com/charlesdedampierre/BunkaTopics"
    )

    st.title("Dataset")
    st.write(
        "We used a subset of Wikipedia dataset: https://huggingface.co/datasets/wikimedia/wikipedia"
    )
# Main page intro: describe how the corpus was sampled, then preview some rows.
st.title("How to understand large textual datasets?")
st.info(
    "We randomly sampled 40,000 articles from the English subset 20231101.en of the Wikipedia dataset. We then took the first 500 words of each articles in order to generate an abstract that will be used for topic modeling. Here is a sample:"
)

# Only the first 100 rows are displayed, so stop parsing there (nrows=100)
# instead of loading the whole CSV each time the script runs and discarding
# everything after the head.
df = pd.read_csv("data/data_sample_wikipedia.csv", index_col=[0], nrows=100)
# Keep just the two columns shown in the preview table.
df = df[["text", "url"]]
st.dataframe(df, use_container_width=True)
# Static picture of the topic map, a centred link to the interactive version,
# and a short note about what the interactive map offers.
st.title("Inside the Wikipedia dataset")

map_caption = "This mapping can be done for each subset of the Wikipedia dataset, and the articles can be selected on a topic basis through the python package, allowing to filter and curate the data."
st.image("images/map.png", caption=map_caption, use_column_width=True)

# Raw HTML is used for the centred, coloured heading link, hence
# unsafe_allow_html=True on the markdown call.
interactive_map_link = '<div align="center"><a href="https://charlesdedampierre.github.io/wikipedia-bunka-map"><h2 style="color: #0066ff;">Full Interactive Map</h2></a></div>'
st.markdown(interactive_map_link, unsafe_allow_html=True)

interactive_map_note = "This interactive map explores each datapoint to get a more precise overview of the contents (it takes 10 seconds to load)"
st.info(interactive_map_note)
# Per-topic summary table: name, size, and the "percent" column rendered as a
# whole-number percentage string.
st.title("Some insights by territory")

df_info = pd.read_csv("data/topics_info.csv", index_col=[0])
# .copy() makes the column subset an independent frame, so the assignment to
# "percent" below cannot trigger pandas' SettingWithCopyWarning.
df_info = df_info[["name", "size", "percent"]].copy()

# Multiply by 100 and format as "<int>%" in one vectorised pass. Rounding to
# 3 decimals first absorbs float noise (e.g. 28.999999999999996 -> 29.0)
# before astype(int) truncates toward zero, exactly as int() did per row.
# NOTE(review): truncation means 2.9 is shown as "2%" and anything below 1
# as "0%" -- confirm this is the intended display.
scaled_percent = (df_info["percent"] * 100).round(3)
df_info["percent"] = scaled_percent.astype(int).astype(str) + "%"

st.dataframe(df_info, use_container_width=True)
# Closing section: heading followed by the image stored at images/pipeline.png.
st.title("Bunka Exploration Engine")
st.image("images/pipeline.png", use_column_width=True)