OPENAI - VECTOR EMBEDDING RAG
Simon-Pierre Boucher
2024-09-14
In [1]:
import os
import requests
from dotenv import load_dotenv
from IPython.display import display, HTML
import re

# Load environment variables from the .env file
load_dotenv()

# Get the API key from environment variables
api_key = os.getenv("OPENAI_API_KEY")
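This assumes the .env file contains a line of the form OPENAI_API_KEY=... so that load_dotenv() makes the key visible to os.getenv; if the variable is missing, api_key will be None and every API call below will fail with an authorization error.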
In [2]:
def read_and_split_book(file_path, chunk_size=500):
    """
    Read a book from a text file and split it into smaller chunks.

    Parameters:
    - file_path (str): The path to the text file containing the book.
    - chunk_size (int): The number of characters per chunk.

    Returns:
    - chunks (list of str): A list of chunks from the book.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        
        # Split the text into fixed-size chunks
        chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
        return chunks
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return []
    except Exception as e:
        print(f"An error occurred: {e}")
        return []
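Fixed-size character chunks can cut a sentence in half at every boundary, which hurts retrieval for passages that straddle two chunks. A common refinement is to overlap consecutive chunks so boundary text appears in both. A minimal variant sketch (the overlap parameter and function name are additions for illustration, not part of the original code):

def read_and_split_book_overlap(file_path, chunk_size=500, overlap=100):
    """Variant of read_and_split_book with overlapping chunks (sketch)."""
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
        # Each chunk starts `step` characters after the previous one,
        # so the last `overlap` characters of a chunk reappear in the next.
        step = chunk_size - overlap
        chunks = [text[i:i+chunk_size] for i in range(0, len(text), step)]
        return chunks
    except OSError as e:
        print(f"An error occurred: {e}")
        return []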
In [5]:
import numpy as np
import requests

def get_embeddings(texts, api_key):
    """
    Get embeddings for a list of texts using OpenAI's embedding model.

    Parameters:
    - texts (list of str): The list of texts to get embeddings for.
    - api_key (str): The API key for OpenAI.

    Returns:
    - embeddings (list of list of float): A list of embeddings.
    """
    url = "https://api.openai.com/v1/embeddings"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    data = {
        "model": "text-embedding-ada-002",
        "input": texts
    }

    try:
        response = requests.post(url, headers=headers, json=data)
        response.raise_for_status()
        result = response.json()
        embeddings = [item['embedding'] for item in result['data']]
        return embeddings
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        print(f"Response content: {response.content}")
        return []
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return []

def cosine_similarity(vec1, vec2):
    """Compute the cosine similarity between two vectors."""
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
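Note that the embeddings endpoint limits how many inputs one request may carry (2048 inputs per request at the time of writing, plus per-request token limits), so embedding an entire book in a single call can fail. A hedged batching wrapper around get_embeddings; the batch_size value is an assumed safe default, not an API-documented constant:

def get_embeddings_batched(texts, api_key, batch_size=100):
    """Embed texts in batches to stay under per-request limits (sketch)."""
    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        batch_embeddings = get_embeddings(batch, api_key)
        if not batch_embeddings:
            # get_embeddings returns [] on failure; propagate that upward
            return []
        embeddings.extend(batch_embeddings)
    return embeddings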
In [6]:
def retrieve_documents(query, file_path, api_key, max_chunks=5):
    """
    Retrieve relevant documents from a book file based on a query using semantic search.

    Parameters:
    - query (str): The user's query.
    - file_path (str): The path to the text file containing the book.
    - api_key (str): The API key for OpenAI.
    - max_chunks (int): Maximum number of chunks to return based on relevance.

    Returns:
    - relevant_chunks (list of str): A list of top relevant chunks sorted by similarity.
    """
    # Step 1: Read and split the book into chunks
    chunks = read_and_split_book(file_path)
    
    # If chunks are empty, return empty list
    if not chunks:
        return []

    # Step 2: Get embeddings for the query and the chunks
    # Guard against API failures: get_embeddings returns an empty list on
    # error, and indexing into it would raise an IndexError
    query_embeddings = get_embeddings([query], api_key)
    chunks_embeddings = get_embeddings(chunks, api_key)
    if not query_embeddings or not chunks_embeddings:
        return []
    query_embedding = query_embeddings[0]

    # Step 3: Calculate cosine similarity for each chunk in the book
    similarities = [cosine_similarity(query_embedding, chunk_embedding) for chunk_embedding in chunks_embeddings]

    # Step 4: Sort chunks by similarity in descending order
    sorted_chunks = sorted(zip(chunks, similarities), key=lambda x: x[1], reverse=True)
    
    # Step 5: Select top N relevant chunks
    relevant_chunks = [chunk for chunk, sim in sorted_chunks[:max_chunks]]
    
    return relevant_chunks
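As written, retrieve_documents re-reads the file and re-embeds every chunk on each call, paying the full embedding cost for every query. For repeated queries against the same book, a simple in-memory cache avoids the repeated work. A minimal sketch (the cache dictionary and function name are illustrative additions, not part of the original code):

_embedding_cache = {}  # file_path -> (chunks, chunk_embeddings)

def retrieve_documents_cached(query, file_path, api_key, max_chunks=5):
    """Like retrieve_documents, but embeds the book only once per file (sketch)."""
    if file_path not in _embedding_cache:
        chunks = read_and_split_book(file_path)
        chunk_embeddings = get_embeddings(chunks, api_key)
        if not chunks or not chunk_embeddings:
            return []  # do not cache failures
        _embedding_cache[file_path] = (chunks, chunk_embeddings)
    chunks, chunk_embeddings = _embedding_cache[file_path]
    query_embeddings = get_embeddings([query], api_key)
    if not query_embeddings:
        return []
    similarities = [cosine_similarity(query_embeddings[0], e) for e in chunk_embeddings]
    ranked = sorted(zip(chunks, similarities), key=lambda x: x[1], reverse=True)
    return [chunk for chunk, sim in ranked[:max_chunks]]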
In [8]:
def truncate_context(context, max_tokens=2048):
    """
    Truncate the context to fit within the max token limit for OpenAI's API.

    Parameters:
    - context (str): The context string to be truncated.
    - max_tokens (int): The maximum number of tokens allowed.

    Returns:
    - truncated_context (str): The truncated context string.
    """
    # Convert the context into tokens (approximation: 1 token ~ 4 characters in English)
    max_characters = max_tokens * 4
    
    # Truncate the context if it's longer than the max allowed characters
    if len(context) > max_characters:
        print(f"Truncating context to {max_characters} characters.")
        return context[:max_characters]
    return context
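The four-characters-per-token rule is only a rough heuristic for English text. An exact count can be obtained with the tiktoken library (an assumed extra dependency, not used elsewhere in this notebook). A sketch using the cl100k_base encoding, which matches text-embedding-ada-002 and recent chat models:

import tiktoken

def truncate_context_exact(context, max_tokens=2048):
    """Truncate to an exact token count using tiktoken (sketch)."""
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(context)
    if len(tokens) > max_tokens:
        print(f"Truncating context to {max_tokens} tokens.")
        return encoding.decode(tokens[:max_tokens])
    return context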
In [9]:
def generate_openai_response(api_key, model, query, context):
    """
    Generate a response using OpenAI's API based on the query and context.

    Parameters:
    - api_key (str): The API key for OpenAI.
    - model (str): The model to use for text generation.
    - query (str): The user's query.
    - context (str): The retrieved context information.

    Returns:
    - response (str): The generated response from OpenAI.
    """
    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    messages = [
        {"role": "system", "content": "You are an AI assistant that provides detailed and accurate responses based on given context."},
        {"role": "user", "content": f"Context: {context}\n\nQuestion: {query}"}
    ]
    data = {
        "model": model,
        "messages": messages,
        "temperature": 0.7,
        "max_tokens": 2000,
        "top_p": 1.0,
        "frequency_penalty": 0.0,
        "presence_penalty": 0.0
    }

    try:
        response = requests.post(url, headers=headers, json=data)
        response.raise_for_status()
        result = response.json()
        return result["choices"][0]["message"]["content"]
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        return "Error in generating response."
In [10]:
import logging

# Setup logging
logging.basicConfig(filename='rag_system.log', level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

def rag_system(api_key, model, query, file_path):
    """
    Perform Retrieval-Augmented Generation (RAG) using OpenAI's API.

    Parameters:
    - api_key (str): The API key for OpenAI.
    - model (str): The model to use for text generation.
    - query (str): The user's query.
    - file_path (str): The path to the text file containing the book.

    Returns:
    - response (str): The generated response from OpenAI based on retrieved chunks.
    """
    # Step 1: Retrieve relevant chunks using semantic search
    retrieved_chunks = retrieve_documents(query, file_path, api_key)
    
    # Step 2: Handle no relevant chunks found
    if not retrieved_chunks:
        context = "No relevant information was found in the book."
    else:
        # Combine retrieved chunks into a single context string
        context = " ".join(retrieved_chunks)

    # Step 3: Log the context for debugging (optional)
    logging.debug("Generated Context: %s", context)
    logging.debug("Context length (in characters): %d", len(context))

    # Step 4: Truncate context if necessary
    context = truncate_context(context, max_tokens=2048)

    # Step 5: Generate response based on context and query
    response = generate_openai_response(api_key, model, query, context)
    
    return response
In [12]:
# Example usage of the RAG system with a text file as the corpus

# Path to the text file containing the book
file_path = "corpus.txt"  # Replace with your actual file path

# User query
query = "qui est le père pauvre et pourquoi auteur dit cela"

# OpenAI model (the API key was already loaded from the .env file in the first cell)
model = "gpt-4o-mini"

# Generate response using the RAG system with chunk-based retrieval
response = rag_system(api_key, model, query, file_path)
print(response)
The poor dad, in this context, is a figure who represents a traditional financial mindset, often associated with job security and the value of formal education. He is described as highly educated, yet he remains financially poor. The author notes that his poor dad held a pessimistic view of wealth, saying things like "I will never be rich," which reflects a limiting mindset and a self-fulfilling prophecy.

The author brings this up to contrast the philosophies of his poor dad and his rich dad. The rich dad, by contrast, had a positive, proactive attitude toward wealth, defining himself as a rich man even in hard times. This contrast serves to illustrate the importance of mindset and personal beliefs in building wealth and achieving financial success. The idea is that the attitudes and beliefs one holds about money can significantly influence a person's financial outcomes.
In [ ]: