import os

from langchain.chains import RetrievalQA, ConversationalRetrievalChain
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings, CohereEmbeddings
from langchain.llms import OpenAI, HuggingFaceHub, Cohere
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter, NLTKTextSplitter, \
    SpacyTextSplitter
from langchain.vectorstores import FAISS, Chroma, ElasticVectorSearch
from pypdf import PdfReader

from schema import EmbeddingTypes, IndexerType, TransformType, BotType
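
# QnASystem wires together a retrieval-augmented QA pipeline: load a PDF into
# Documents, split them into chunks, embed and index the chunks in a vector
# store, then answer questions through a RetrievalQA or
# ConversationalRetrievalChain.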
class QnASystem:
    def read_and_load_pdf(self, f_data):
        """Read a PDF file object and load each page as a Document."""
        pdf_data = PdfReader(f_data)
        documents = []
        for idx, page in enumerate(pdf_data.pages):
            documents.append(Document(page_content=page.extract_text(),
                                      metadata={"page_no": idx, "source": f_data.name}))
        self.documents = documents

    def document_transformer(self, transform_type: TransformType):
        """Split the loaded documents into chunks with the chosen splitter."""
        match transform_type:
            case TransformType.CharacterTransform:
                t_type = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
            case TransformType.RecursiveTransform:
                t_type = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
            case TransformType.NLTKTransform:
                t_type = NLTKTextSplitter()
            case TransformType.SpacyTransform:
                t_type = SpacyTextSplitter()
            case _:
                raise ValueError("Invalid Transformer Type")
        self.transformed_documents = t_type.split_documents(documents=self.documents)

    def generate_embeddings(self, embedding_type: EmbeddingTypes = EmbeddingTypes.OPENAI,
                            indexer_type: IndexerType = IndexerType.FAISS, **kwargs):
        """Build the embedding model and LLM, then index the chunks in a vector store."""
        temperature = kwargs.get("temperature", 0)
        max_tokens = kwargs.get("max_tokens", 512)
        match embedding_type:
            case EmbeddingTypes.OPENAI:
                api_key = kwargs.get("api_key") or os.getenv("OPENAI_API_KEY")
                if not api_key:
                    raise ValueError("An OpenAI API key is required")
                os.environ["OPENAI_API_KEY"] = api_key
                embeddings = OpenAIEmbeddings()
                llm = OpenAI(temperature=temperature, max_tokens=max_tokens)
            case EmbeddingTypes.HUGGING_FACE:
                embeddings = HuggingFaceEmbeddings(model_name=kwargs.get("model_name"))
                # HuggingFaceHub takes generation parameters via model_kwargs;
                # "max_length" is the usual token-limit key there, not "max_tokens".
                llm = HuggingFaceHub(repo_id=kwargs.get("model_name"),
                                     model_kwargs={"temperature": temperature, "max_length": max_tokens})
            case EmbeddingTypes.COHERE:
                embeddings = CohereEmbeddings(model=kwargs.get("model_name"),
                                              cohere_api_key=kwargs.get("api_key"))
                # The Cohere LLM wrapper accepts temperature and max_tokens directly.
                llm = Cohere(model=kwargs.get("model_name"), cohere_api_key=kwargs.get("api_key"),
                             temperature=temperature, max_tokens=max_tokens)
            case _:
                raise ValueError("Invalid Embedding Type")
        indexer_kwargs = {}
        match indexer_type:
            case IndexerType.FAISS:
                indexer = FAISS
            case IndexerType.CHROMA:
                # from_documents is a classmethod, so keep the class, not an instance.
                indexer = Chroma
            case IndexerType.ELASTICSEARCH:
                indexer = ElasticVectorSearch
                indexer_kwargs["elasticsearch_url"] = kwargs.get("elasticsearch_url")
            case _:
                raise ValueError("Invalid Indexer Type")
        self.llm = llm
        self.indexer = indexer
        self.vector_store = indexer.from_documents(documents=self.transformed_documents,
                                                   embedding=embeddings, **indexer_kwargs)

    def get_retriever(self, search_type="similarity", top_k=5, **kwargs):
        retriever = self.vector_store.as_retriever(search_type=search_type, search_kwargs={"k": top_k})
        self.retriever = retriever

    def get_prompt(self, bot_type: BotType, **kwargs):
        match bot_type:
            case BotType.qna:
                prompt = """
                You are a smart and helpful AI assistant who answers the question from the given context
                {context}
                Question: {question}
                """
                input_variables = ["context", "question"]
            case BotType.conversational:
                prompt = """
                Given the following conversation and a follow up question,
                rephrase the follow up question to be a standalone question, in its original language.
                \nChat History:\n{chat_history}\nFollow Up Input: {question}\nStandalone question:
                """
                input_variables = ["chat_history", "question"]
            case _:
                raise ValueError("Invalid Bot Type")
        return PromptTemplate(input_variables=input_variables, template=prompt)
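
    # Note: get_prompt is not wired into the chains built below. If desired,
    # the templates could be passed in (e.g. via RetrievalQA's chain_type_kwargs
    # or ConversationalRetrievalChain's condense_question_prompt); that wiring
    # is a suggestion only and is not part of the original flow.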

    def build_qa(self, qa_type: BotType, chain_type="stuff",
                 return_documents: bool = True, **kwargs):
        match qa_type:
            case BotType.qna:
                self.chain = RetrievalQA.from_chain_type(llm=self.llm, retriever=self.retriever,
                                                         chain_type=chain_type,
                                                         return_source_documents=return_documents,
                                                         verbose=True)
            case BotType.conversational:
                # output_key="answer" lets the memory ignore the returned source documents.
                self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True,
                                                       output_key="answer")
                self.chain = ConversationalRetrievalChain.from_llm(llm=self.llm, retriever=self.retriever,
                                                                   chain_type=chain_type,
                                                                   return_source_documents=return_documents,
                                                                   memory=self.memory, verbose=True)
            case _:
                raise ValueError("Invalid QA Type")
        return self.chain

    def ask_question(self, query):
        # RetrievalQA expects "query" as its input key; ConversationalRetrievalChain expects "question".
        if isinstance(self.chain, RetrievalQA):
            data = {"query": query}
        else:
            data = {"question": query}
        return self.chain(data)

    def build_chain(self, transform_type, embedding_type, indexer_type, **kwargs):
        # Reuse an already-built chain instead of re-indexing the documents.
        if hasattr(self, "chain"):
            return self.chain
        self.document_transformer(transform_type)
        self.generate_embeddings(embedding_type=embedding_type,
                                 indexer_type=indexer_type, **kwargs)
        self.get_retriever(**kwargs)
        return self.build_qa(qa_type=kwargs.get("bot_type"), **kwargs)


if __name__ == "__main__":
    qna = QnASystem()
    with open("../docs/Doc A.pdf", "rb") as f:
        qna.read_and_load_pdf(f)
    chain = qna.build_chain(
        transform_type=TransformType.RecursiveTransform,
        embedding_type=EmbeddingTypes.OPENAI, indexer_type=IndexerType.FAISS,
        chain_type="map_reduce", bot_type=BotType.conversational, return_documents=True
    )
    answer = qna.ask_question(query="Hi! Summarize the document.")
    print(answer)
    answer = qna.ask_question(query="What happened from June 1984 to September 1996?")
    print(answer)
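
    # Sketch of the same pipeline with Hugging Face models (illustrative only;
    # note that build_chain passes one model_name to both the embeddings and
    # the hub LLM, so separate values may be needed in practice, and
    # HUGGINGFACEHUB_API_TOKEN must be set in the environment):
    # chain = qna.build_chain(
    #     transform_type=TransformType.RecursiveTransform,
    #     embedding_type=EmbeddingTypes.HUGGING_FACE, indexer_type=IndexerType.FAISS,
    #     model_name="google/flan-t5-base", bot_type=BotType.qna,
    # )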