OPENAI - VECTOR EMBEDDING RAG
Simon-Pierre Boucher
2024-09-14
In [1]:
import os
import requests
from dotenv import load_dotenv
from IPython.display import display, HTML
import re
# Load environment variables from the .env file
load_dotenv()
# Get the API key from environment variables
api_key = os.getenv("OPENAI_API_KEY")
In [2]:
def read_and_split_book(file_path, chunk_size=500):
    """
    Read a book from a text file and split it into smaller chunks.

    Parameters:
    - file_path (str): The path to the text file containing the book.
    - chunk_size (int): The number of characters per chunk.

    Returns:
    - chunks (list of str): A list of chunks from the book.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        # Split the text into fixed-size chunks
        chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
        return chunks
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return []
    except Exception as e:
        print(f"An error occurred: {e}")
        return []
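Fixed-size chunks can cut a sentence or paragraph in half, which can hurt retrieval quality. If that matters for your corpus, a paragraph-aware splitter is one alternative; the sketch below is a minimal variant, not part of the original pipeline, and the helper name is my own:

def read_and_split_book_paragraphs(file_path, max_chars=500):
    """Split on blank lines first, then pack whole paragraphs into ~max_chars chunks."""
    with open(file_path, 'r', encoding='utf-8') as f:
        paragraphs = [p.strip() for p in f.read().split("\n\n") if p.strip()]
    chunks, current = [], ""
    for para in paragraphs:
        # Start a new chunk when adding this paragraph would exceed the budget
        if current and len(current) + len(para) + 1 > max_chars:
            chunks.append(current)
            current = para
        else:
            current = f"{current} {para}".strip()
    if current:
        chunks.append(current)
    return chunks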
In [5]:
import numpy as np
import requests

def get_embeddings(texts, api_key):
    """
    Get embeddings for a list of texts using OpenAI's embedding model.

    Parameters:
    - texts (list of str): The list of texts to get embeddings for.
    - api_key (str): The API key for OpenAI.

    Returns:
    - embeddings (list of list of float): A list of embeddings.
    """
    url = "https://api.openai.com/v1/embeddings"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    data = {
        "model": "text-embedding-ada-002",
        "input": texts
    }
    try:
        response = requests.post(url, headers=headers, json=data)
        response.raise_for_status()
        result = response.json()
        embeddings = [item['embedding'] for item in result['data']]
        return embeddings
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        print(f"Response content: {response.content}")
        return []
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return []

def cosine_similarity(vec1, vec2):
    """Compute the cosine similarity between two vectors."""
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
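A quick sanity check of cosine_similarity with toy vectors, requiring no API call:

# Vectors pointing the same way score 1.0; orthogonal vectors score 0.0
a = np.array([1.0, 2.0, 3.0])
b = np.array([2.0, 4.0, 6.0])   # same direction as a, different magnitude
c = np.array([-2.0, 1.0, 0.0])  # orthogonal to a (dot product is zero)
print(cosine_similarity(a, b))  # 1.0
print(round(cosine_similarity(a, c), 4))  # 0.0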
In [6]:
def retrieve_documents(query, file_path, api_key, max_chunks=5):
    """
    Retrieve relevant chunks from a book file based on a query using semantic search.

    Parameters:
    - query (str): The user's query.
    - file_path (str): The path to the text file containing the book.
    - api_key (str): The API key for OpenAI.
    - max_chunks (int): Maximum number of chunks to return based on relevance.

    Returns:
    - relevant_chunks (list of str): A list of top relevant chunks sorted by similarity.
    """
    # Step 1: Read and split the book into chunks
    chunks = read_and_split_book(file_path)
    if not chunks:
        return []
    # Step 2: Get embeddings for the query and the chunks
    query_embeddings = get_embeddings([query], api_key)
    chunks_embeddings = get_embeddings(chunks, api_key)
    # Guard against failed embedding requests, which return empty lists
    if not query_embeddings or not chunks_embeddings:
        return []
    query_embedding = query_embeddings[0]
    # Step 3: Calculate cosine similarity between the query and each chunk
    similarities = [cosine_similarity(query_embedding, chunk_embedding) for chunk_embedding in chunks_embeddings]
    # Step 4: Sort chunks by similarity in descending order
    sorted_chunks = sorted(zip(chunks, similarities), key=lambda x: x[1], reverse=True)
    # Step 5: Select the top N most relevant chunks
    relevant_chunks = [chunk for chunk, sim in sorted_chunks[:max_chunks]]
    return relevant_chunks
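The embeddings endpoint limits how many inputs a single request can carry, so embedding a long book in one call may fail. A minimal batching sketch built on get_embeddings as defined above; the batch size of 100 is an arbitrary assumption, not a documented limit:

def get_embeddings_batched(texts, api_key, batch_size=100):
    """Embed texts in batches to stay under per-request input limits."""
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        batch_embeddings = get_embeddings(batch, api_key)
        if not batch_embeddings:  # one failed batch invalidates the whole result
            return []
        embeddings.extend(batch_embeddings)
    return embeddings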
In [8]:
def truncate_context(context, max_tokens=2048):
    """
    Truncate the context to fit within the max token limit for OpenAI's API.

    Parameters:
    - context (str): The context string to be truncated.
    - max_tokens (int): The maximum number of tokens allowed.

    Returns:
    - truncated_context (str): The truncated context string.
    """
    # Rough approximation: 1 token is about 4 characters of English text
    max_characters = max_tokens * 4
    # Truncate the context if it's longer than the max allowed characters
    if len(context) > max_characters:
        print(f"Truncating context to {max_characters} characters.")
        return context[:max_characters]
    return context
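The 4-characters-per-token rule is only a heuristic. For exact counts, OpenAI's tiktoken library can truncate at a real token boundary; a sketch, assuming tiktoken is installed (pip install tiktoken) and with a fallback encoding for tiktoken versions that don't recognize the model name:

import tiktoken

def truncate_context_exact(context, max_tokens=2048, model="gpt-4o-mini"):
    """Truncate at a real token boundary using the model's tokenizer."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")  # fallback for older tiktoken
    tokens = encoding.encode(context)
    if len(tokens) <= max_tokens:
        return context
    return encoding.decode(tokens[:max_tokens])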
In [9]:
def generate_openai_response(api_key, model, query, context):
    """
    Generate a response using OpenAI's API based on the query and context.

    Parameters:
    - api_key (str): The API key for OpenAI.
    - model (str): The model to use for text generation.
    - query (str): The user's query.
    - context (str): The retrieved context information.

    Returns:
    - response (str): The generated response from OpenAI.
    """
    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    messages = [
        {"role": "system", "content": "You are an AI assistant that provides detailed and accurate responses based on given context."},
        {"role": "user", "content": f"Context: {context}\n\nQuestion: {query}"}
    ]
    data = {
        "model": model,
        "messages": messages,
        "temperature": 0.7,
        "max_tokens": 2000,
        "top_p": 1.0,
        "frequency_penalty": 0.0,
        "presence_penalty": 0.0
    }
    try:
        response = requests.post(url, headers=headers, json=data)
        response.raise_for_status()
        result = response.json()
        return result["choices"][0]["message"]["content"]
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return "Error in generating response."
In [10]:
import logging

# Set up logging
logging.basicConfig(filename='rag_system.log', level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')

def rag_system(api_key, model, query, file_path):
    """
    Perform Retrieval-Augmented Generation (RAG) using OpenAI's API.

    Parameters:
    - api_key (str): The API key for OpenAI.
    - model (str): The model to use for text generation.
    - query (str): The user's query.
    - file_path (str): The path to the text file containing the book.

    Returns:
    - response (str): The generated response from OpenAI based on retrieved chunks.
    """
    # Step 1: Retrieve relevant chunks using semantic search
    retrieved_chunks = retrieve_documents(query, file_path, api_key)
    # Step 2: Handle the case where no relevant chunks were found
    if not retrieved_chunks:
        context = "No relevant information was found in the book."
    else:
        # Combine retrieved chunks into a single context string
        context = " ".join(retrieved_chunks)
    # Step 3: Log the context for debugging (optional)
    logging.debug("Generated Context: %s", context)
    logging.debug("Context length (in characters): %d", len(context))
    # Step 4: Truncate the context if necessary
    context = truncate_context(context, max_tokens=2048)
    # Step 5: Generate a response based on the context and query
    response = generate_openai_response(api_key, model, query, context)
    return response
In [12]:
# Example usage of the RAG system with a text file as the corpus

# Path to the text file containing the book
file_path = "corpus.txt"  # Replace with your actual file path

# User query
query = "Who is the poor dad, and why does the author describe him that way?"

# Model to use for generation (the API key was loaded earlier from .env)
model = "gpt-4o-mini"

# Generate a response using the RAG system with chunk-based retrieval
response = rag_system(api_key, model, query, file_path)
print(response)
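Note that rag_system re-reads and re-embeds every chunk on every query, which gets slow and costly for a large corpus. A simple improvement is to cache the chunk embeddings on disk and reuse them; a sketch built on the helpers above (the cache file name is illustrative):

def get_chunk_embeddings_cached(chunks, api_key, cache_path="chunk_embeddings.npy"):
    """Load chunk embeddings from disk if present; otherwise embed once and save."""
    if os.path.exists(cache_path):
        return np.load(cache_path).tolist()
    embeddings = get_embeddings(chunks, api_key)
    if embeddings:
        np.save(cache_path, np.array(embeddings))
    return embeddings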
In [ ]: