Spaces:
Sleeping
Sleeping
initial commit
Browse files- .gitignore +4 -0
- Dockerfile +26 -0
- chainlit.md +5 -0
- qa.py +97 -0
- requirements.txt +103 -0
.gitignore
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.env
|
2 |
+
.chainlit/*
|
3 |
+
.files/*
|
4 |
+
__pycache__/*
|
Dockerfile
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Use an official Python runtime as a parent image
FROM python:3.12-slim

# Run as a non-root user (expected by Hugging Face Spaces).
RUN useradd -m -u 1000 user
USER user

ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Set the working directory in the container
WORKDIR $HOME/app

# Copy requirements first so the dependency layer is cached
# across source-code-only changes.
COPY --chown=user ./requirements.txt $HOME/app/requirements.txt

# Install any needed dependencies specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the current directory contents into the container at $HOME/app.
# NOTE: a plain `COPY . .` after this would re-copy everything as
# root-owned (COPY ignores USER), breaking writes at runtime — so the
# chown'd copy is the only source copy.
COPY --chown=user . $HOME/app

# Set environment variables
ENV PYTHONUNBUFFERED=1

# Command to run the app (shell form so ${PORT} expands at runtime)
CMD python -m chainlit run qa.py -h --host 0.0.0.0 --port ${PORT}
|
chainlit.md
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Welcome to Meta Filings Chat! 🚀🤖
|
2 |
+
|
3 |
+
Hi there! You can ask me anything about meta 10K filings for year 2023.
|
4 |
+
|
5 |
+
### Please wait while I initialize...
|
qa.py
ADDED
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from typing import List
|
3 |
+
|
4 |
+
from langchain.embeddings.openai import OpenAIEmbeddings
|
5 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
6 |
+
from langchain.vectorstores import Chroma
|
7 |
+
from langchain.chains import (
|
8 |
+
ConversationalRetrievalChain,
|
9 |
+
)
|
10 |
+
from langchain.chat_models import ChatOpenAI
|
11 |
+
|
12 |
+
from langchain.docstore.document import Document
|
13 |
+
from langchain.memory import ChatMessageHistory, ConversationBufferMemory
|
14 |
+
from langchain_community.document_loaders import PyMuPDFLoader
|
15 |
+
from langchain_community.vectorstores import Qdrant
|
16 |
+
|
17 |
+
import chainlit as cl
|
18 |
+
|
19 |
+
# Split loaded documents into ~600-character chunks with 50-character
# overlap before embedding, so retrieval returns focused passages.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=50)
|
20 |
+
|
21 |
+
|
22 |
+
@cl.on_chat_start
async def on_chat_start():
    """Initialize the chat session.

    Downloads Meta's 2023 10-K filing as a PDF, splits it into chunks,
    indexes the chunks in an in-memory Qdrant vector store, builds a
    conversational retrieval chain on top of it, and stores the chain in
    the Chainlit user session for `main` to use.
    """
    msg = cl.Message(content="Please wait... initializing.", disable_feedback=True)
    await msg.send()

    # Meta 10-K filing PDF, fetched over HTTP by PyMuPDF.
    loader = PyMuPDFLoader(
        "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf",
    )
    documents = loader.load()

    # Split the documents into overlapping chunks for retrieval.
    documents = text_splitter.split_documents(documents)

    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    # Build an in-memory Qdrant vector store. Wrapped in make_async so the
    # (blocking) embedding/indexing work doesn't stall the event loop.
    docsearch = await cl.make_async(Qdrant.from_documents)(
        documents, embeddings, location=":memory:"
    )

    message_history = ChatMessageHistory()

    memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        chat_memory=message_history,
        return_messages=True,
    )

    # Conversational RAG chain backed by the Qdrant retriever.
    chain = ConversationalRetrievalChain.from_llm(
        ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, streaming=True),
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        memory=memory,
        return_source_documents=True,
    )

    # Let the user know that the system is ready.
    msg.content = "Initialization successful. You can now ask questions!"
    await msg.update()

    cl.user_session.set("chain", chain)
70 |
+
|
71 |
+
|
72 |
+
@cl.on_message
async def main(message: cl.Message):
    """Answer a user message with the session's retrieval chain.

    Runs the question through the ConversationalRetrievalChain stored by
    `on_chat_start`, attaches each retrieved source chunk as a Chainlit
    text element, and sends the answer (with a source list) back.
    """
    chain = cl.user_session.get("chain")  # type: ConversationalRetrievalChain
    callback_handler = cl.AsyncLangchainCallbackHandler()

    result = await chain.acall(message.content, callbacks=[callback_handler])
    answer = result["answer"]
    sources = result["source_documents"]  # type: List[Document]

    text_elements = []  # type: List[cl.Text]

    if sources:
        # One side-panel text element per retrieved chunk, named so the
        # answer can reference them.
        text_elements = [
            cl.Text(content=doc.page_content, name=f"source_{idx}")
            for idx, doc in enumerate(sources)
        ]
        names = [element.name for element in text_elements]

        if names:
            answer += f"\nSources: {', '.join(names)}"
        else:
            answer += "\nNo sources found"

    await cl.Message(content=answer, elements=text_elements).send()
|
requirements.txt
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiofiles==23.2.1
|
2 |
+
aiohttp==3.9.5
|
3 |
+
aiosignal==1.3.1
|
4 |
+
annotated-types==0.6.0
|
5 |
+
anyio==3.7.1
|
6 |
+
asyncer==0.0.2
|
7 |
+
attrs==23.2.0
|
8 |
+
bidict==0.23.1
|
9 |
+
certifi==2024.2.2
|
10 |
+
chainlit==1.0.505
|
11 |
+
charset-normalizer==3.3.2
|
12 |
+
chevron==0.14.0
|
13 |
+
click==8.1.7
|
14 |
+
dataclasses-json==0.5.14
|
15 |
+
Deprecated==1.2.14
|
16 |
+
distro==1.9.0
|
17 |
+
fastapi==0.110.2
|
18 |
+
fastapi-socketio==0.0.10
|
19 |
+
filetype==1.2.0
|
20 |
+
frozenlist==1.4.1
|
21 |
+
googleapis-common-protos==1.63.0
|
22 |
+
grpcio==1.62.2
|
23 |
+
grpcio-tools==1.62.2
|
24 |
+
h11==0.14.0
|
25 |
+
h2==4.1.0
|
26 |
+
hpack==4.0.0
|
27 |
+
httpcore==1.0.5
|
28 |
+
httpx==0.27.0
|
29 |
+
hyperframe==6.0.1
|
30 |
+
idna==3.7
|
31 |
+
importlib-metadata==7.0.0
|
32 |
+
jsonpatch==1.33
|
33 |
+
jsonpointer==2.4
|
34 |
+
langchain==0.1.16
|
35 |
+
langchain-community==0.0.34
|
36 |
+
langchain-core==0.1.46
|
37 |
+
langchain-openai==0.1.4
|
38 |
+
langchain-text-splitters==0.0.1
|
39 |
+
langchainhub==0.1.15
|
40 |
+
langsmith==0.1.51
|
41 |
+
Lazify==0.4.0
|
42 |
+
literalai==0.0.507
|
43 |
+
marshmallow==3.21.1
|
44 |
+
multidict==6.0.5
|
45 |
+
mypy-extensions==1.0.0
|
46 |
+
nest-asyncio==1.6.0
|
47 |
+
numpy==1.26.4
|
48 |
+
openai==1.23.6
|
49 |
+
opentelemetry-api==1.24.0
|
50 |
+
opentelemetry-exporter-otlp==1.24.0
|
51 |
+
opentelemetry-exporter-otlp-proto-common==1.24.0
|
52 |
+
opentelemetry-exporter-otlp-proto-grpc==1.24.0
|
53 |
+
opentelemetry-exporter-otlp-proto-http==1.24.0
|
54 |
+
opentelemetry-instrumentation==0.45b0
|
55 |
+
opentelemetry-proto==1.24.0
|
56 |
+
opentelemetry-sdk==1.24.0
|
57 |
+
opentelemetry-semantic-conventions==0.45b0
|
58 |
+
orjson==3.10.1
|
59 |
+
packaging==23.2
|
60 |
+
pandas==2.2.2
|
61 |
+
portalocker==2.8.2
|
62 |
+
protobuf==4.25.3
|
63 |
+
pydantic==2.7.1
|
64 |
+
pydantic_core==2.18.2
|
65 |
+
PyJWT==2.8.0
|
66 |
+
PyMuPDF==1.24.2
|
67 |
+
PyMuPDFb==1.24.1
|
68 |
+
python-dateutil==2.9.0.post0
|
69 |
+
python-dotenv==1.0.1
|
70 |
+
python-engineio==4.9.0
|
71 |
+
python-graphql-client==0.4.3
|
72 |
+
python-multipart==0.0.9
|
73 |
+
python-socketio==5.11.2
|
74 |
+
pytz==2024.1
|
75 |
+
PyYAML==6.0.1
|
76 |
+
qdrant-client==1.9.0
|
77 |
+
regex==2024.4.16
|
78 |
+
requests==2.31.0
|
79 |
+
setuptools==68.2.2
|
80 |
+
simple-websocket==1.0.0
|
81 |
+
six==1.16.0
|
82 |
+
sniffio==1.3.1
|
83 |
+
SQLAlchemy==2.0.29
|
84 |
+
starlette==0.37.2
|
85 |
+
syncer==2.0.3
|
86 |
+
tenacity==8.2.3
|
87 |
+
tiktoken==0.6.0
|
88 |
+
tomli==2.0.1
|
89 |
+
tqdm==4.66.2
|
90 |
+
types-requests==2.31.0.20240406
|
91 |
+
typing-inspect==0.9.0
|
92 |
+
typing_extensions==4.11.0
|
93 |
+
tzdata==2024.1
|
94 |
+
uptrace==1.24.0
|
95 |
+
urllib3==2.2.1
|
96 |
+
uvicorn==0.25.0
|
97 |
+
watchfiles==0.20.0
|
98 |
+
websockets==12.0
|
99 |
+
wheel==0.41.2
|
100 |
+
wrapt==1.16.0
|
101 |
+
wsproto==1.2.0
|
102 |
+
yarl==1.9.4
|
103 |
+
zipp==3.18.1
|