# PDF QA RAG App

## Importing necessary libraries 

In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.prompts import PromptTemplate 
from langchain_core.output_parsers import StrOutputParser 
from operator import itemgetter

from pinecone import Pinecone, ServerlessSpec

## Defining helper functions

In [2]:
def load_split_file(file_path, chunk_size=200, chunk_overlap=10):
    """Load a PDF and split it into small overlapping chunks.

    Args:
        file_path: Path to the PDF file to read.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared between adjacent chunks.

    Returns:
        The list of chunk documents produced by the text splitter.
    """
    loader = PyPDFLoader(file_path)
    # Use load() rather than load_and_split(): the latter already chunks the
    # pages with a default splitter, so the text would be split twice.
    pages = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(pages)



In [3]:
def create_index(
    index_name,
    PINECONE_API_KEY,
    *,
    dimension=384,
    metric="cosine",
    cloud="aws",
    region="us-east-1",
):
    """Create a fresh serverless Pinecone index and return its name.

    If an index called ``index_name`` already exists it is deleted first, so
    the index is always recreated from scratch.

    Args:
        index_name: Name of the index to (re)create.
        PINECONE_API_KEY: API key passed to the Pinecone client.
        dimension: Vector dimension of the index; must match the embedding
            model that will write to it.
        metric: Similarity metric used by the index.
        cloud: Cloud provider for the serverless spec.
        region: Cloud region for the serverless spec.

    Returns:
        ``index_name``, unchanged.
    """
    pc = Pinecone(api_key=PINECONE_API_KEY)

    # Drop any existing index of the same name to avoid conflicts in retrieval.
    if index_name in pc.list_indexes().names():
        pc.delete_index(index_name)

    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric=metric,
        spec=ServerlessSpec(cloud=cloud, region=region),
    )

    return index_name



In [4]:
def final_response(index, question, model):
    """Answer ``question`` from the documents stored in ``index``.

    Builds a retrieval chain (retriever -> prompt -> model -> string parser)
    and also runs a separate top-2 similarity search so the caller can see
    matching documents.

    Args:
        index: Vector store exposing ``as_retriever()`` and
            ``similarity_search()``.
        question: The user's question.
        model: Chat model placed after the prompt in the chain.

    Returns:
        A tuple ``(answer, matching_results)`` where ``answer`` is the string
        ``"Answer: <model output>"`` and ``matching_results`` is the result of
        ``index.similarity_search(question, k=2)``.
    """
    template = """
    You must provide an answer based strictly on the context below. The answer is highly likely to be found within the given context, so analyze it thoroughly before responding. Only if there is absolutely no relevant information, respond with "I don't know".

    Context: {context}

    Question: {question}
    """
    prompt = PromptTemplate.from_template(template)

    def _join_page_content(documents):
        # Give the prompt plain text instead of the repr of a Document list.
        return "\n\n".join(doc.page_content for doc in documents)

    retriever = index.as_retriever()

    chain = (
        {
            "context": itemgetter("question") | retriever | _join_page_content,
            "question": itemgetter("question"),
        }
        | prompt
        | model
        | StrOutputParser()
    )

    matching_results = index.similarity_search(question, k=2)

    return f"Answer: {chain.invoke({'question': question})}", matching_results



In [None]:
import gradio as gr
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Pinecone as LangchainPinecone
from utilis import load_split_file, create_index, final_response
from langchain_mistralai.chat_models import ChatMistralAI

import os
import shutil
from dotenv import load_dotenv

In [6]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# API credentials; each is None when the variable is not set.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")

# Not referenced by any code visible in this notebook.
SAVE_DIR = "/RAG-APP/data.pdf"


In [None]:
# Embedding model used to vectorize document chunks and queries.
# NOTE(review): create_index builds a 384-dimension index by default; confirm
# this model's output dimension matches.
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Chat model that generates the final answer.
model = ChatMistralAI(mistral_api_key=MISTRAL_API_KEY)
pinecone_index = "index"
# create_index deletes any existing index with this name before creating it.
index_name = create_index(pinecone_index, PINECONE_API_KEY)

In [None]:
# Load the source PDF and split it into chunk documents for indexing.
file_path = "data/last lesson.pdf"
docs = load_split_file(file_path)

In [None]:
# Embed the chunks and store them in the Pinecone index created earlier.
index = LangchainPinecone.from_documents(docs, embeddings, index_name=index_name)
question = "What data does google collects?"

# final_response returns (answer_text, matching_documents). Unpack it rather
# than printing the whole tuple, and reuse its similarity-search result
# instead of running the same search a second time here.
answer, matching_results = final_response(index, question, model)

print(f"{answer}\n\n{matching_results}")