Spaces:

Svngoku
/

african-history-smolagent-rag

Running

App Files Files Community

african-history-smolagent-rag / app.py

Svngoku

Update app.py

6690b0b verified 10 days ago

raw

history blame contribute delete

7.13 kB

	import streamlit as st
	from smolagents import Tool, CodeAgent, HfApiModel
	from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownTextSplitter
	from langchain_community.retrievers import BM25Retriever
	from langchain.docstore.document import Document
	from datasets import load_dataset, concatenate_datasets

	# Page-level settings: browser-tab title and icon, plus the wide layout.
	st.set_page_config(
	    layout="wide",
	    page_icon="🌍",
	    page_title="African History Search Engine",
	)

	class RetrieverTool(Tool):
	    """BM25 keyword-search tool over a list of pre-split documents.

	    The class attributes below (``name``, ``description``, ``inputs``,
	    ``output_type``) describe the tool to the agent; ``forward`` is the
	    entry point and takes the single ``query`` input declared in ``inputs``.
	    """

	    name = "retriever"
	    description = "Uses BM25 search to retrieve relevant African historical documentation"
	    inputs = {
	        "query": {
	            "type": "string",
	            "description": "The historical query in affirmative form rather than a question",
	        }
	    }
	    output_type = "string"

	    # Leading interrogatives removed from a query before searching.
	    _QUESTION_WORDS = frozenset({"what", "when", "where", "who", "why", "how"})

	    def __init__(self, docs, k1=1.5, b=0.75, **kwargs):
	        """Index ``docs`` with BM25.

	        Args:
	            docs: Documents exposing ``page_content`` and ``metadata``.
	            k1: BM25 term-frequency saturation parameter.
	            b: BM25 length-normalisation parameter.
	            **kwargs: Forwarded unchanged to ``Tool.__init__``.

	        Raises:
	            ValueError: If ``docs`` is empty (nothing to index).
	        """
	        super().__init__(**kwargs)
	        if not docs:
	            raise ValueError("RetrieverTool requires at least one document to index")

	        self.retriever = BM25Retriever.from_documents(
	            docs,
	            k=12,
	            # BM25 scoring parameters must go through ``bm25_params``; as
	            # bare keyword arguments they never reach the scorer.
	            bm25_params={"k1": k1, "b": b},
	            # Index documents with the same lower-casing applied to queries
	            # so that matching is case-insensitive.
	            preprocess_func=self._tokenize,
	        )
	        self.docs = docs
	        total_words = sum(len(doc.page_content.split()) for doc in docs)
	        self.avg_doc_length = total_words / len(docs)

	    @staticmethod
	    def _tokenize(text: str) -> list:
	        """Lower-case ``text`` and split it on whitespace."""
	        return text.lower().split()

	    def forward(self, query: str) -> str:  # Matches exactly with inputs
	        """Search the index and return the hits as one formatted string.

	        Args:
	            query: Free-text search query.

	        Returns:
	            A header line followed by one section per retrieved document
	            (rank, length factor, content, metadata when present, and a
	            ``---`` separator), or a short notice when the query contains no
	            search terms.
	        """
	        # Preprocess query
	        query = self._preprocess_query(query)
	        if not query:
	            return "No search terms were provided; please supply a non-empty query."

	        # Retrieve documents
	        docs = self.retriever.invoke(query)

	        # Format response; parts are collected and joined once at the end.
	        parts = ["Retrieved documents (ranked by relevance):\n\n"]
	        for i, doc in enumerate(docs, 1):
	            doc_length = len(doc.page_content.split())
	            # avg_doc_length is 0 when every indexed document is blank.
	            if self.avg_doc_length:
	                length_factor = doc_length / self.avg_doc_length
	            else:
	                length_factor = 0.0

	            parts.append(f"Document {i} (Length Factor: {length_factor:.2f})\n")
	            parts.append(f"{doc.page_content}\n\n")

	            if doc.metadata:
	                parts.append(f"Metadata: {doc.metadata}\n")
	            parts.append("---\n\n")

	        return "".join(parts)

	    def _preprocess_query(self, query: str) -> str:
	        """Lower-case ``query`` and drop one leading question word, if any.

	        Returns an empty string for an empty, whitespace-only or ``None``
	        query instead of raising.
	        """
	        query_terms = (query or "").lower().split()
	        if query_terms and query_terms[0] in self._QUESTION_WORDS:
	            query_terms = query_terms[1:]
	        return " ".join(query_terms)

	# Process documents
	def prepare_docs(documents):
	    """Split ``documents`` with a MarkdownTextSplitter and return the chunks.

	    The splitter is configured with ``chunk_size=1000`` and
	    ``chunk_overlap=200``.
	    """
	    splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}
	    return MarkdownTextSplitter(**splitter_options).split_documents(documents)

	# Initialize agent
	def create_rag_agent(processed_docs):
	    """Return a verbose CodeAgent whose only tool is a RetrieverTool.

	    The retriever is built over ``processed_docs`` and the agent uses a
	    default-constructed ``HfApiModel``.
	    """
	    tools = [RetrieverTool(processed_docs)]
	    llm = HfApiModel()
	    return CodeAgent(tools=tools, model=llm, verbose=True)

	def format_search_results(results: str):
	    """Render agent output, split into main content and sources when possible.

	    If the text contains the ``### 📚 Sources:`` marker, everything before
	    the first occurrence is shown in a left column under "Main Findings" and
	    everything after it in a right column under "Sources". Otherwise the
	    text is rendered as a single markdown block.

	    Args:
	        results: Agent output. Non-string values are converted with ``str``
	            so that the marker test and the rendering cannot fail on them.
	    """
	    marker = "### 📚 Sources:"
	    text = results if isinstance(results, str) else str(results)

	    # partition splits on the first marker only, so a repeated marker cannot
	    # break the two-way split the way an unbounded split() would.
	    main_content, found, sources = text.partition(marker)
	    if not found:
	        st.markdown(text)
	        return

	    # Create two columns with adjusted ratios
	    col1, col2 = st.columns([3, 2])

	    with col1:
	        st.markdown("### 📖 Main Findings")
	        st.markdown(main_content)

	    with col2:
	        st.markdown("### 📚 Sources")
	        # NOTE(review): this renders raw HTML taken from model output;
	        # confirm that is intended, or drop unsafe_allow_html.
	        st.markdown(sources, unsafe_allow_html=True)

	@st.cache_resource
	def get_agent():
	    """Load the dataset, chunk it, and build the retrieval agent.

	    Decorated with ``st.cache_resource``. Returns a ``CodeAgent`` whose only
	    tool is a ``RetrieverTool`` over the chunked documents.
	    """

	    def to_document(row):
	        # Wrap one dataset row as a Document, carrying its descriptive fields.
	        row_metadata = {
	            "source": row['url'],
	            "title": row['title'],
	            "description": row['description'],
	            "published_time": row['publishedTime'],
	        }
	        return Document(page_content=row['content'], metadata=row_metadata)

	    # Load both splits and treat them as one corpus.
	    splits = load_dataset("Svngoku/African-History-Extra-11-30-24")
	    corpus = concatenate_datasets([splits["train"], splits["test"]])
	    raw_documents = [to_document(row) for row in corpus]

	    # Chunk the documents before indexing.
	    chunker = RecursiveCharacterTextSplitter(
	        chunk_size=1000,
	        chunk_overlap=500,
	        add_start_index=True,
	        strip_whitespace=True,
	    )
	    chunks = chunker.split_documents(raw_documents)

	    # Assemble the agent around a retriever built from the chunks.
	    return CodeAgent(tools=[RetrieverTool(chunks)], model=HfApiModel())

	# Streamlit UI
	st.title("🌍 African History Search Engine")
	st.markdown("""
	This search engine uses advanced AI to help you explore African history.
	It provides detailed, sourced information from a curated database of historical documents.
	""")

	# Initialize agent (kept in session state so later reruns reuse it)
	if 'agent' not in st.session_state:
	    with st.spinner("Loading historical database..."):
	        st.session_state.agent = get_agent()

	# Search interface
	search_query = st.text_input(
	    "🔍 Search African History",
	    placeholder="E.g., Tell me about the Kingdom of Kush",
	    help="Enter any question about African history"
	)

	# Advanced search options
	with st.expander("Advanced Search Options"):
	    search_type = st.radio(
	        "Search Type",
	        ["General Query", "Specific Time Period", "Geographic Region"],
	        help="Select the type of search you want to perform"
	    )

	# Keep the user's own text separate from the prefixed prompt. Otherwise the
	# prefix alone makes an empty input look like a real query and the
	# "please enter a search query" warning is never shown.
	raw_query = (search_query or "").strip()
	if search_type == "Specific Time Period":
	    search_query = f"Focus on the time period: {raw_query}"
	elif search_type == "Geographic Region":
	    search_query = f"Focus on the region of: {raw_query}"
	else:
	    search_query = raw_query

	# Search button
	if st.button("Search", type="primary"):
	    if raw_query:
	        with st.spinner("Searching historical records..."):
	            # Broad catch is deliberate: this is the top-level UI boundary
	            # and any failure is reported to the user instead of crashing.
	            try:
	                results = st.session_state.agent.run(search_query)

	                # Use the formatter to display results
	                format_search_results(results)

	                # Add methodology note
	                st.markdown("---")
	                st.info("""
	💡 How to read the results:
	- Main findings are summarized on the left
	- Source references are numbered [Source X]
	- Click on source details on the right to expand
	- Follow the links to read the original articles
	""")

	            except Exception as e:
	                st.error(f"An error occurred during the search: {e}")
	    else:
	        st.warning("Please enter a search query to begin.")

	# Sidebar with additional information
	with st.sidebar:
	    st.markdown("### About This Search Engine")
	    st.markdown("""
	This search engine specializes in African history, providing:
	- 📚 Detailed historical information
	- 🔍 Source verification
	- 🌍 Geographic context
	- ⏳ Historical timeline context
	""")

	    st.markdown("### Data Sources")
	    st.markdown("Our database includes information from various historical documents, "
	                "academic papers, and verified historical records.")

	# Footer
	st.markdown("---")
	st.caption("Powered by SmolAgents, RAG, and African History Dataset")