allanctan commited on
Commit
45eef34
·
1 Parent(s): 857a212

initial commit

Browse files
Files changed (5) hide show
  1. .gitignore +4 -0
  2. Dockerfile +26 -0
  3. chainlit.md +5 -0
  4. qa.py +97 -0
  5. requirements.txt +103 -0
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ .env
2
+ .chainlit/*
3
+ .files/*
4
+ __pycache__/*
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use an official Python runtime as a parent image
FROM python:3.12-slim

# Run as a non-root user (uid 1000 is expected by e.g. Hugging Face Spaces).
RUN useradd -m -u 1000 user
USER user

# Put user-level pip installs (~/.local/bin) on PATH.
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Set the working directory in the container
WORKDIR $HOME/app

# Copy only the dependency manifest first so the expensive pip layer is
# cached and only rebuilt when requirements.txt changes.
COPY --chown=user ./requirements.txt $HOME/app/requirements.txt

# Install any needed dependencies specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application source once, after deps, owned by the runtime user
# (a bare `COPY . .` would create root-owned files the app user can't write).
COPY --chown=user . $HOME/app

# Unbuffered stdout/stderr so container logs appear immediately.
ENV PYTHONUNBUFFERED=1

# Run the Chainlit app headless, bound to the platform-provided port.
CMD python -m chainlit run qa.py -h --host 0.0.0.0 --port ${PORT}
chainlit.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Welcome to Meta Filings Chat! 🚀🤖
2
+
3
+ Hi there! You can ask me anything about Meta's 10-K filings for the year 2023.
4
+
5
+ ### Please wait while I initialize...
qa.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List
3
+
4
+ from langchain.embeddings.openai import OpenAIEmbeddings
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+ from langchain.vectorstores import Chroma
7
+ from langchain.chains import (
8
+ ConversationalRetrievalChain,
9
+ )
10
+ from langchain.chat_models import ChatOpenAI
11
+
12
+ from langchain.docstore.document import Document
13
+ from langchain.memory import ChatMessageHistory, ConversationBufferMemory
14
+ from langchain_community.document_loaders import PyMuPDFLoader
15
+ from langchain_community.vectorstores import Qdrant
16
+
17
+ import chainlit as cl
18
+
19
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=50)
20
+
21
+
22
@cl.on_chat_start
async def on_chat_start():
    """Build the RAG pipeline when a new chat session starts.

    Downloads Meta's 2023 10-K filing PDF, splits it into overlapping
    chunks, embeds them into an in-memory Qdrant vector store, and stores
    a ConversationalRetrievalChain in the user session under "chain" for
    the on_message handler to use.
    """
    msg = cl.Message(content="Please wait... initializing.", disable_feedback=True)
    await msg.send()

    loader = PyMuPDFLoader(
        "https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf",
    )
    documents = loader.load()

    # Split the loaded pages into retrieval-sized chunks.
    documents = text_splitter.split_documents(documents)

    # Embedding model used for both indexing and query-time retrieval.
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    # Build an in-memory Qdrant vector store; make_async keeps the
    # blocking indexing work off the event loop.
    docsearch = await cl.make_async(Qdrant.from_documents)(
        documents, embeddings, location=":memory:"
    )

    # Conversation memory so follow-up questions keep their context.
    message_history = ChatMessageHistory()
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        output_key="answer",
        chat_memory=message_history,
        return_messages=True,
    )

    # Retrieval-augmented chat chain over the vector store.
    chain = ConversationalRetrievalChain.from_llm(
        ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, streaming=True),
        chain_type="stuff",
        retriever=docsearch.as_retriever(),
        memory=memory,
        return_source_documents=True,
    )

    # Let the user know that the system is ready
    msg.content = "Initialization successful. You can now ask questions!"
    await msg.update()

    cl.user_session.set("chain", chain)
70
+
71
+
72
@cl.on_message
async def main(message: cl.Message):
    """Answer an incoming user message with the session's retrieval chain,
    attaching each retrieved source chunk as an inline text element."""
    chain = cl.user_session.get("chain")  # type: ConversationalRetrievalChain
    cb = cl.AsyncLangchainCallbackHandler()

    res = await chain.acall(message.content, callbacks=[cb])
    answer = res["answer"]
    source_documents = res["source_documents"]  # type: List[Document]

    # One cl.Text element per retrieved chunk, named source_0, source_1, ...
    text_elements = [
        cl.Text(content=doc.page_content, name=f"source_{idx}")
        for idx, doc in enumerate(source_documents)
    ]  # type: List[cl.Text]

    if source_documents:
        source_names = [element.name for element in text_elements]
        if source_names:
            answer += f"\nSources: {', '.join(source_names)}"
        else:
            answer += "\nNo sources found"

    await cl.Message(content=answer, elements=text_elements).send()
requirements.txt ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ aiohttp==3.9.5
3
+ aiosignal==1.3.1
4
+ annotated-types==0.6.0
5
+ anyio==3.7.1
6
+ asyncer==0.0.2
7
+ attrs==23.2.0
8
+ bidict==0.23.1
9
+ certifi==2024.2.2
10
+ chainlit==1.0.505
11
+ charset-normalizer==3.3.2
12
+ chevron==0.14.0
13
+ click==8.1.7
14
+ dataclasses-json==0.5.14
15
+ Deprecated==1.2.14
16
+ distro==1.9.0
17
+ fastapi==0.110.2
18
+ fastapi-socketio==0.0.10
19
+ filetype==1.2.0
20
+ frozenlist==1.4.1
21
+ googleapis-common-protos==1.63.0
22
+ grpcio==1.62.2
23
+ grpcio-tools==1.62.2
24
+ h11==0.14.0
25
+ h2==4.1.0
26
+ hpack==4.0.0
27
+ httpcore==1.0.5
28
+ httpx==0.27.0
29
+ hyperframe==6.0.1
30
+ idna==3.7
31
+ importlib-metadata==7.0.0
32
+ jsonpatch==1.33
33
+ jsonpointer==2.4
34
+ langchain==0.1.16
35
+ langchain-community==0.0.34
36
+ langchain-core==0.1.46
37
+ langchain-openai==0.1.4
38
+ langchain-text-splitters==0.0.1
39
+ langchainhub==0.1.15
40
+ langsmith==0.1.51
41
+ Lazify==0.4.0
42
+ literalai==0.0.507
43
+ marshmallow==3.21.1
44
+ multidict==6.0.5
45
+ mypy-extensions==1.0.0
46
+ nest-asyncio==1.6.0
47
+ numpy==1.26.4
48
+ openai==1.23.6
49
+ opentelemetry-api==1.24.0
50
+ opentelemetry-exporter-otlp==1.24.0
51
+ opentelemetry-exporter-otlp-proto-common==1.24.0
52
+ opentelemetry-exporter-otlp-proto-grpc==1.24.0
53
+ opentelemetry-exporter-otlp-proto-http==1.24.0
54
+ opentelemetry-instrumentation==0.45b0
55
+ opentelemetry-proto==1.24.0
56
+ opentelemetry-sdk==1.24.0
57
+ opentelemetry-semantic-conventions==0.45b0
58
+ orjson==3.10.1
59
+ packaging==23.2
60
+ pandas==2.2.2
61
+ portalocker==2.8.2
62
+ protobuf==4.25.3
63
+ pydantic==2.7.1
64
+ pydantic_core==2.18.2
65
+ PyJWT==2.8.0
66
+ PyMuPDF==1.24.2
67
+ PyMuPDFb==1.24.1
68
+ python-dateutil==2.9.0.post0
69
+ python-dotenv==1.0.1
70
+ python-engineio==4.9.0
71
+ python-graphql-client==0.4.3
72
+ python-multipart==0.0.9
73
+ python-socketio==5.11.2
74
+ pytz==2024.1
75
+ PyYAML==6.0.1
76
+ qdrant-client==1.9.0
77
+ regex==2024.4.16
78
+ requests==2.31.0
79
+ setuptools==68.2.2
80
+ simple-websocket==1.0.0
81
+ six==1.16.0
82
+ sniffio==1.3.1
83
+ SQLAlchemy==2.0.29
84
+ starlette==0.37.2
85
+ syncer==2.0.3
86
+ tenacity==8.2.3
87
+ tiktoken==0.6.0
88
+ tomli==2.0.1
89
+ tqdm==4.66.2
90
+ types-requests==2.31.0.20240406
91
+ typing-inspect==0.9.0
92
+ typing_extensions==4.11.0
93
+ tzdata==2024.1
94
+ uptrace==1.24.0
95
+ urllib3==2.2.1
96
+ uvicorn==0.25.0
97
+ watchfiles==0.20.0
98
+ websockets==12.0
99
+ wheel==0.41.2
100
+ wrapt==1.16.0
101
+ wsproto==1.2.0
102
+ yarl==1.9.4
103
+ zipp==3.18.1