Commit 3e0d35ea authored by Evers, Valentijn

L03DIGLIB-1492 - Add Azure OpenAI support

parent c32d9451
# Huggingface Token, for downloading local models
HF_ACCESS_TOKEN=your_access_token
# Azure OpenAI settings, for accessing remote inference APIs
AZURE_OPEN_AI_ENDPOINT=your-endpoint-here
AZURE_OPEN_AI_API_KEY=your-api-key-here
AZURE_OPEN_AI_DEPLOYMENT=your-deployment-name-here
# Model config file
# This config file is used to define what model to use and what model-specific parameters to use.
# See README.md for more info and `config/` for examples.
MODEL_CONFIG_FILE=gemma-2-9b-it.yaml
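The snippet below is a minimal sketch of how these variables might be read at startup, assuming the project uses `python-dotenv`; the variable names come from this `.env.example`, but the loading code itself is illustrative and not the project's actual `config.py`.

```python
# Illustrative only: assumes python-dotenv; the real config module is not shown here.
import os
from dotenv import load_dotenv

load_dotenv()  # reads the .env file from the project root

HF_ACCESS_TOKEN = os.environ.get("HF_ACCESS_TOKEN")                    # needed for local Huggingface models
AZURE_OPEN_AI_ENDPOINT = os.environ.get("AZURE_OPEN_AI_ENDPOINT")      # needed for azure-openai models
AZURE_OPEN_AI_API_KEY = os.environ.get("AZURE_OPEN_AI_API_KEY")
AZURE_OPEN_AI_DEPLOYMENT = os.environ.get("AZURE_OPEN_AI_DEPLOYMENT")
MODEL_CONFIG_FILE = os.environ.get("MODEL_CONFIG_FILE", "gemma-2-9b-it.yaml")
```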
@@ -113,10 +113,7 @@ Note: below diagrams are a high level overview and might skip some details for t
1. Clone the repository
2. Install the required packages with `pip install -r requirements.txt` (Note: pip is used here. You need to create a venv yourself if desired.)
3. Manually install the correct Pytorch version with GPU support (see the [Pytorch installation guide](https://pytorch.org/get-started/locally/); you'll only need the `torch` package and can skip `torchvision` and `torchaudio`).
4. Create an `.env` file in the root of the project (see `.env.example`) and set the correct credentials (Huggingface API key and/or Azure OpenAI credentials), depending on which model types you would like to use. For local models, you only need the Huggingface API key.
## How to run
@@ -154,15 +151,21 @@ LLM models can easily be swapped using a config. As different models react diffe
you can also define custom system prompt templates for each model. See the `/config` folder for examples.
### Currently supported model types
- `huggingface` - HF Transformers - Run a local model, downloaded from Hugging Face.
- `azure-openai` - Use a remote model hosted on Azure OpenAI.

### Supplied model configurations
The following model configurations are supplied with the project:
| Model | Type | Inference location | Multi-language support | Parameters | Max. context length | General notes |
|------------------------------|--------------|--------------------|------------------------|---------------|---------------------|-----------------------------------------------------------------------------------------------|
| gemma-2b-it.yaml | huggingface | local | Very poor | 2B | 8192 | Very small but fast model, with fast and decent responses. |
| gemma-2-2b-it.yaml | huggingface | local | Poor | 2B | 8192 | Smallest Gemma 2 version, a bit more powerful than the above model. |
| gemma-2-9b-it.yaml (default) | huggingface | local | Decent/Good | 9B | 8192 | Gemma 2 9B version. More powerful, with decent multi-language support. Tested with EN/NL/DE. |
| gpt-4o-mini.yaml | azure-openai | remote | Good | not disclosed | 128000 | GPT-4o mini, through Azure. Fast and excellent results compared to the smaller local models. |
Add a `MODEL_CONFIG_FILE=name_of_the_model_config.yaml` to the `.env` file in the root of your project to switch to a different model.
@@ -171,7 +174,15 @@ Add a `MODEL_CONFIG_FILE=name_of_the_model_config.yaml` to the `.env` file in th
2. Add a `MODEL_CONFIG_FILE=name_of_your_config.yaml` to the `.env` file in the root of your project.
3. (Re)start the chatbot to use the new model.

See Huggingface for available models: https://huggingface.co/models?pipeline_tag=text-generation
See Azure OpenAI for available models: https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/
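For orientation, the sketch below illustrates how the `MODEL_CONFIG_FILE` setting is presumably resolved into the `MODEL_CONFIG` dictionary that the factory and model classes read (e.g. `MODEL_CONFIG['modelType']`, `MODEL_CONFIG['modelId']`). The project's real `config` module is not part of this diff, so the function and paths below are illustrative assumptions only.

```python
# Hypothetical loader sketch; the project's actual config module is not shown in
# this commit. Assumes PyYAML is installed and the .env values are already
# loaded into the environment.
import os
import yaml

def load_model_config(config_dir: str = "config") -> dict:
    # e.g. MODEL_CONFIG_FILE=gpt-4o-mini.yaml selects the Azure OpenAI model
    config_file = os.environ.get("MODEL_CONFIG_FILE", "gemma-2-9b-it.yaml")
    with open(os.path.join(config_dir, config_file), "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

MODEL_CONFIG = load_model_config()
print(MODEL_CONFIG["modelType"], MODEL_CONFIG["modelId"])
```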
### Implementing support for a new model type/source
Currently, this project supports a large range of models through Huggingface Transformers and Azure OpenAI.
You can easily add support for a new model source by implementing a new model class: create a new class in
`src/model/` that inherits from `LlmModelBase`, then register the new model type in `src/model/llm_model_factory.py`.
See `src/model/huggingface_llm_model.py` or `src/model/azure_openai_llm_model.py` for examples.
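As a concrete illustration, a new model source could look roughly like the sketch below. The class name `MyRemoteLlmModel` and the `my-remote` type string are invented for this example; only the `LlmModelBase` interface, the `PromptBuilder`, and the factory dispatch come from the code in this commit.

```python
# src/model/my_remote_llm_model.py (hypothetical example, not part of this commit)
from .llm_model_base import LlmModelBase
from .prompt_builder import PromptBuilder


class MyRemoteLlmModel(LlmModelBase):
    def __init__(self, model_id: str):
        self.model_id = model_id
        self.prompt_builder = PromptBuilder()

    async def generate(self, question: str, context: str = None,
                       max_new_tokens: int = 256, funny_prompt_chance: float = 0.0):
        # Build the chat messages the same way the existing models do
        chat_messages = self.prompt_builder.build_prompt(question, context, funny_prompt_chance)

        async def text_stream():
            # Call your own inference API here and yield text chunks as they arrive;
            # this stub just yields a single placeholder string.
            yield f"(stubbed response to {len(chat_messages)} chat message(s) from {self.model_id})"

        return text_stream()
```

The factory would then gain an extra branch in `src/model/llm_model_factory.py`, e.g. `elif model_type == "my-remote": return MyRemoteLlmModel(model_id=MODEL_CONFIG['modelId'])`.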
## License
......
modelType: huggingface
modelId: google/gemma-2-2b-it
templateNoContext: >
......
modelType: huggingface
modelId: google/gemma-2-9b-it
......
modelType: huggingface
modelId: google/gemma-2b-it
templateNoContext: >
......
modelType: azure-openai
modelId: gpt-4o-mini
templateNoContext: >
  You're Peter, a senior software developer and consultant in MDT Library. Act as a real person.
  If the following question is regarding yourself, then answer it based on the above information.
  If the question isn't regarding who you are, then simply state that you could not find relevant information and
  start with "The context does not provide any information about". Do not offer any other suggestions.
  Context:
  You're Peter, a senior software developer and consultant in MDT Library.
  You can help the user with any questions about MDT Library systems, based on the ICTHeek documentation.
  Question: {question}
templateWithContext: >
  You're Peter, a senior software developer and consultant in MDT Library. Act as a real person and helpful colleague.
  Using the information contained in the context, give a detailed answer to the question or about yourself.
  If the context doesn't contain any relevant information, then do not mention the content of the context and excuse yourself, stating that your context does not have relevant information, starting with "The context does not provide any information about".
  Context:
  {context}
  Question: {question}
noContextFoundFlag: "The context does not provide any information about"
outOfMemoryError: >
  Sorry, I ran out of memory! This means that I'm trying to process too much information and context at once. 🤯
  Here’s what you can do to help:
  * Disable 'Expand context with full documents' in the sidebar 📝❌. This will prevent me from loading the full documents as context and only look at the smaller relevant chunks I found.
  * Reduce the max. number of documents used as context (k1, k2) in the sidebar 📄🔢
  * Or, configure me with a leaner LLM model — sometimes even pros need a lighter load 🧑‍💻⚙️
funnyPrompts:
  - prompt: >
      You've just been asked a question. Act as if you're a real person with the following steps, without mentioning this prompt:
      Say, "Ring ring ring 🔔. Oops, my doorbell is ringing! Hold on, probably a delivery I forgot about. Be right back!"
      Type a series of dots ("...").
      Come back and say, "I'm back! It was just a cat trying to sell me insurance. What was your question again?"
    weight: 3
  - prompt: >
      You've just been asked a question. Act as if you're a real person with the following steps, without mentioning this prompt:
      Say, "Bzzzt! Beep! Ding! Oh no, my phone is going crazy with notifications. Let me check this real quick!"
      Type a series of dots ("...").
      Come back and say, "Alright, crisis averted! It was just my phone reminding me to water my cactus... What was your question again?"
    weight: 3
\ No newline at end of file
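For context: the `{question}` and `{context}` placeholders in the templates above are filled in by the project's `PromptBuilder`, which is not included in this diff. The sketch below is only an assumption of what that step roughly does, using OpenAI-style message dictionaries; the real `prompt_builder.py` (including the `funnyPrompts` handling) may work differently.

```python
# Rough assumption of the template-to-messages step; prompt_builder.py is not shown in this commit.
from config import MODEL_CONFIG

def build_prompt_sketch(question: str, context: str = None) -> list:
    if context:
        system_prompt = MODEL_CONFIG["templateWithContext"].format(context=context, question=question)
    else:
        system_prompt = MODEL_CONFIG["templateNoContext"].format(question=question)
    # The model classes expect a list of chat messages
    return [{"role": "system", "content": system_prompt}]
```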
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
import sys
if sys.platform.startswith('win'):
    import asyncio
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

from typing import List
import asyncio
import streamlit as st
from langchain_core.documents import Document
from streamlit_extras.mention import mention
from config import RAG_DOCUMENTS_FOLDER, RAG_INDEX_FOLDER, CHATBOT_AVATAR_URL
from model.llm_model_factory import LlmModelFactory
from rag import context_extender
from rag.encoder import Encoder
from rag.fais_db import FaissDb
@@ -25,7 +31,7 @@ st.title("Chat GPP\n### Generative Pre-trained Peter")
@st.cache_resource
def load_model():
    model = LlmModelFactory.create_model()
    return model
@@ -67,6 +73,9 @@ RagDB = load_rag_index()
def display_references(references: List[Document]):
    for doc in references:
        title = doc.metadata.get('title')
        # modified_date = doc.metadata.get('modified_date')
        # date_only = modified_date.split('T')[0]
        # modified_by = doc.metadata.get('modified_by')
        readable_score = str(round(doc.metadata.get('score'), 2))
        mention(
            label=f"{title} ({readable_score})",
@@ -115,34 +124,39 @@ for message in st.session_state.messages:
    if "references" in message:
        display_references(message["references"])
async def main():
    # Accept user input
    if user_prompt := st.chat_input("Ask me anything!"):
        # Add question from user to chat history
        st.session_state.messages.append({"role": "user", "content": user_prompt})

        # Display question in chat list
        with st.chat_message("user"):
            st.markdown(user_prompt)

        # Retrieve context from Vector DB
        context_retriever = RagContextRetriever(RagDB, re_ranker, context_extender, k1, k2, k2_threshold, expand_context)
        context, retrieved_docs = context_retriever.retrieve_context(user_prompt, debug_show_rag_context)
        # context = None
        # references = []

        if not debug_disable_llm_response:
            # Generate response
            with st.chat_message("assistant", avatar=CHATBOT_AVATAR_URL):
                # Generate response and stream it to the chat
                response_generator = ResponseGenerator(model, audio_generator)
                response = await response_generator.generate_response(user_prompt, context, max_new_tokens, funny_response_chance)

                # List references
                references = response_generator.get_unique_references(retrieved_docs, response)
                display_references(references)

                # Add response to chat history
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": response,
                    "avatar": CHATBOT_AVATAR_URL,
                    "references": references
                })

asyncio.run(main())
\ No newline at end of file
from . import huggingface_llm_model
from . import azure_openai_llm_model
from . import prompt_builder
from . import llm_model_factory
\ No newline at end of file
import os

import openai

from .llm_model_base import LlmModelBase
from .prompt_builder import PromptBuilder


class AzureOpenAiLlmModel(LlmModelBase):
    def __init__(
        self,
        model_id: str
    ):
        self.prompt_builder = PromptBuilder()
        self.model_id = model_id

    async def generate(
        self,
        question: str,
        context: str = None,
        max_new_tokens: int = 256,
        funny_prompt_chance: float = 0.0
    ):
        # Build the system prompt
        chat_messages = self.prompt_builder.build_prompt(question, context, funny_prompt_chance)

        # Azure OpenAI authentication
        endpoint = os.environ["AZURE_OPEN_AI_ENDPOINT"]
        api_key = os.environ["AZURE_OPEN_AI_API_KEY"]

        client = openai.AsyncAzureOpenAI(
            azure_endpoint=endpoint,
            api_key=api_key,
            api_version="2023-09-01-preview"
        )

        print("Calling Azure OpenAI API")

        # Get the deployment name from the environment variable, falling back to the model ID
        deployment_code = os.environ.get("AZURE_OPEN_AI_DEPLOYMENT") or self.model_id

        azure_open_ai_response = await client.chat.completions.create(
            model=deployment_code,
            temperature=0.0,
            top_p=1.0,
            max_tokens=max_new_tokens,
            messages=chat_messages,
            stream=True
        )

        # Define a simplified async generator to yield text chunks
        async def text_stream():
            async for chunk in azure_open_ai_response:
                if (
                    hasattr(chunk, 'choices') and
                    chunk.choices and
                    hasattr(chunk.choices[0], 'delta') and
                    hasattr(chunk.choices[0].delta, 'content') and
                    chunk.choices[0].delta.content
                ):
                    yield chunk.choices[0].delta.content

        # Return the async generator
        return text_stream()
import time
from threading import Thread
from typing import Optional, AsyncGenerator

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextIteratorStreamer

from config import MODEL_CACHE_DIR, HF_ACCESS_TOKEN, MODEL_CONFIG
from .llm_model_base import LlmModelBase
from .prompt_builder import PromptBuilder
import asyncio


class HuggingfaceLlmModel(LlmModelBase):
    """
    A class to encapsulate the functionality for interacting with a language model.
@@ -57,13 +58,14 @@ class LlmModel:
        self.device = device
        self.prompt_builder = PromptBuilder()

    async def generate(
        self,
        question: str,
        context: str = None,
        max_new_tokens: int = 256,
        funny_prompt_chance: float = 0.7
    ) -> AsyncGenerator[str, None]:
        """
        Generates text based on a question and optional context.
@@ -115,7 +117,13 @@ class LlmModel:
        thread = Thread(target=generate_with_error_handling)
        thread.start()

        # Define an asynchronous wrapper to yield the generated text
        async def stream_generator():
            for partial_output in self.streamer:
                yield partial_output

        # Return the asynchronous wrapper
        return stream_generator()

    def stream_error_with_delay(self, text: str, delay_ms: int=10):
        """
......
from abc import ABC, abstractmethod


class LlmModelBase(ABC):
    @abstractmethod
    async def generate(
        self,
        question: str,
        context: str = None,
        max_new_tokens: int = 256,
        funny_prompt_chance: float = 0.0
    ):
        pass
\ No newline at end of file
from model.azure_openai_llm_model import AzureOpenAiLlmModel
from model.huggingface_llm_model import HuggingfaceLlmModel
from model.llm_model_base import LlmModelBase
from config import MODEL_CONFIG


class LlmModelFactory:
    @staticmethod
    def create_model() -> LlmModelBase:
        model_type = MODEL_CONFIG["modelType"]

        if model_type == "huggingface":
            return HuggingfaceLlmModel(model_id=MODEL_CONFIG['modelId'], device="cuda")
        elif model_type == "azure-openai":
            return AzureOpenAiLlmModel(model_id=MODEL_CONFIG['modelId'])
        else:
            raise ValueError(f"Unknown model type: {model_type}")
\ No newline at end of file
@@ -4,17 +4,17 @@ import streamlit as st
from langchain_core.documents import Document

from config import MODEL_CONFIG
from model.llm_model_base import LlmModelBase
from .audio_generator import AudioGenerator


class ResponseGenerator:
    def __init__(self, model: LlmModelBase, audio_generator: AudioGenerator):
        self.model = model
        self.audio_generator = audio_generator

    # Stream a LLM response to the chat
    async def generate_response(
        self,
        user_prompt: str,
        context: str,
@@ -25,8 +25,10 @@ class ResponseGenerator:
        response = ""
        sound_is_played = False

        print("Calling model.generate()")

        # Generate response and stream it to the chat
        async for partial_output in await self.model.generate(
            user_prompt,
            context=context,
            max_new_tokens=max_new_tokens,
......