
RAG Model

This document outlines a Python script that processes emails through a retrieval-augmented generation (RAG) workflow built on Google Generative AI and LangChain. It defines functions for cleaning email content, extracting structured data, appending that data to an Excel file, and generating replies grounded in context retrieved from past emails. The script then demonstrates the workflow end to end on a sample email: retrieving relevant chunks from a ChromaDB vector store, generating a reply, and updating the Excel file with the extracted information.

import pandas as pd
import re
import os
import google.generativeai as genai
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from chromadb import Client
# from dotenv import load_dotenv  # No longer needed if using Colab secrets
from google.colab import userdata

# Load the API key (from Colab secrets now, instead of a .env file)
API_KEY = userdata.get("GOOGLE_API_KEY")
if not API_KEY:
    raise ValueError("GOOGLE_API_KEY not found. Please set it in Colab secrets.")

genai.configure(api_key=API_KEY)

# Use the working model
model = genai.GenerativeModel('gemini-1.5-flash-latest')

# Initialize components (assuming these are already initialized from previous steps)

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

# Initialize the embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize an in-memory ChromaDB client and get the collection
client = Client()
collection = client.get_or_create_collection(name="email_chunks")
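
The script assumes the "email_chunks" collection was populated in an earlier step. As a minimal sketch of what that indexing step might look like (the past_emails list and the chunk-ID scheme here are hypothetical), past emails can be split, embedded, and added to the collection:

# Hypothetical indexing step (assumed to have run previously): split each past
# email into chunks, embed them, and store them with per-email metadata.
past_emails = [
    "The phase 2 implementation deadline was set to June 30 in the kickoff notes.",
    "Phase 1 was signed off last week; phase 2 planning starts Monday.",
]
for email_id, email_text in enumerate(past_emails):
    chunks = text_splitter.split_text(email_text)
    collection.add(
        ids=[f"email{email_id}_chunk{i}" for i in range(len(chunks))],
        documents=chunks,
        embeddings=embedding_model.encode(chunks).tolist(),
        metadatas=[{"source_email": email_id} for _ in chunks],
    )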

# Define cleaning, extraction, and append functions (re-defining for clarity in this block)

def clean_email_content(email_content):
    """
    Cleans and formats email content by removing HTML tags and special characters.
    """
    cleaned_content = re.sub(r'<.*?>', '', email_content)
    cleaned_content = re.sub(r'[^a-zA-Z0-9\s.,!?;:]', '', cleaned_content)
    cleaned_content = re.sub(r'\s+', ' ', cleaned_content).strip()
    return cleaned_content
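
For example, the cleaner strips tags and disallowed characters, then collapses whitespace (hypothetical input):

# clean_email_content("<p>Hi  <b>team</b>!</p>")  ->  "Hi team!"

Note that hyphens are not in the allowed character set, so a phone number like 555-1234 survives cleaning as 5551234; the contact regex below still matches it, since any run of seven or more digits, spaces, or hyphens qualifies.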

def extract_data_for_excel(email_content):
    """
    Extracts sender, subject, content summary, and contact number from email content.
    (Simplified for this example, assuming sender/subject are known from the email structure)
    """
    extracted_data = {
        'Sender': 'N/A',
        'Subject': 'N/A',
        'Summary': 'N/A',
        'Contact': 'N/A'
    }
    extracted_data['Summary'] = (email_content[:100] + '...'
                                 if len(email_content) > 100 else email_content)
    contact_match = re.search(r'[\d\s-]{7,}', email_content)
    if contact_match:
        extracted_data['Contact'] = contact_match.group(0).strip()
    return extracted_data
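
A quick check of the extractor on a short string (hypothetical input, expected output shown as a comment):

# extract_data_for_excel("Please call 555 123 4567 about the invoice.")
# -> {'Sender': 'N/A', 'Subject': 'N/A',
#     'Summary': 'Please call 555 123 4567 about the invoice.',
#     'Contact': '555 123 4567'}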

def append_to_excel(data_dict, file_path):
    """
    Appends extracted email data to an Excel file.
    """
    # Ensure the directory exists (skip when the path has no directory part,
    # e.g. the default 'extracted_email_data.xlsx', where os.makedirs('') would fail)
    dir_name = os.path.dirname(file_path)
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)
    if not os.path.exists(file_path):
        df = pd.DataFrame([data_dict])
        df.to_excel(file_path, index=False)
    else:
        df = pd.DataFrame([data_dict])
        with pd.ExcelWriter(file_path, mode='a', engine='openpyxl',
                            if_sheet_exists='overlay') as writer:
            df.to_excel(writer, index=False, header=False,
                        startrow=writer.book.active.max_row)
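
With mode='a' and if_sheet_exists='overlay' (available in pandas 1.4+ with the openpyxl engine), each call writes the new row into the existing sheet starting at its current max_row, so rows accumulate across calls instead of replacing the sheet.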

def search_email_chunks(query_string, n_results=3):
    """
    Searches the ChromaDB collection for email chunks relevant to a query.
    """
    query_embedding = embedding_model.encode(query_string).tolist()
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
        include=['documents', 'distances', 'metadatas']
    )
    return results
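
ChromaDB returns results as parallel lists keyed by field, with one inner list per query embedding; a sketch of reading them back (the query string is hypothetical):

# results = search_email_chunks("deadline for phase 2")
# results['documents'][0]  -> the top-n chunk texts for the first (only) query
# results['distances'][0]  -> matching distances, lower meaning more similar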

def generate_reply(original_email_content, retrieved_chunks):
    """
    Generates a suitable email reply using the original email and retrieved context.
    """
    prompt = f"""Original Email:
{original_email_content}

Relevant Context from past emails:
"""
    for i, chunk in enumerate(retrieved_chunks):
        prompt += f"Chunk {i+1}: {chunk}\n"
    prompt += "\nBased on the original email and the relevant context, generate a concise and helpful reply."
    try:
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        print(f"Error generating content: {e}")
        return "Could not generate a reply at this time."

# Integrated function to process a new email

def process_new_email(new_email_content_raw, sender, subject,
                      excel_file_path='extracted_email_data.xlsx'):
    """
    Simulates receiving and processing a new email through the RAG workflow.
    """
    print(f"Processing new email from {sender} with subject: {subject}")
    try:
        # 1. Clean the email content
        cleaned_content = clean_email_content(new_email_content_raw)
        print("Email content cleaned.")

        # 2. Chunk the cleaned content (optional for the RAG query, but good for
        #    adding to the DB if needed)
        # chunks = text_splitter.split_text(cleaned_content)
        # print(f"Email content chunked into {len(chunks)} chunks.")

        # 3. Retrieve relevant context from the vector database
        retrieval_results = search_email_chunks(cleaned_content)
        retrieved_chunks = [doc for sublist in retrieval_results.get('documents', [])
                            for doc in sublist]
        print(f"Retrieved {len(retrieved_chunks)} relevant chunks.")

        # 4. Generate a reply using the language model
        generated_reply = generate_reply(cleaned_content, retrieved_chunks)
        print("\nGenerated Reply:")
        print(generated_reply)

        # 5. Extract data for Excel and append
        extracted_data = extract_data_for_excel(cleaned_content)
        extracted_data['Sender'] = sender    # Add sender from input
        extracted_data['Subject'] = subject  # Add subject from input
        append_to_excel(extracted_data, excel_file_path)
        print(f"\nExtracted data appended to {excel_file_path}.")

        return generated_reply
    except Exception as e:
        print(f"\nAn error occurred during email processing: {e}")
        return "An error occurred during processing."

# 3. Create a sample "new" email string
sample_new_email_content = """
Hello team,

Just following up on the project update meeting. Could someone confirm the deadline for phase 2 implementation?

Also, please let me know if you need anything from my side. You can reach me at 555-1234.

Thanks,
Alice
"""

sample_sender = "Alice"
sample_subject = "Follow up on Project Update"

# 4. Call the integrated function with the sample new email
print("--- Starting New Email Processing ---")
process_new_email(sample_new_email_content, sample_sender, sample_subject)
print("--- New Email Processing Finished ---")

# 5. Optionally, read and display the updated Excel file
try:
    updated_extracted_df = pd.read_excel('extracted_email_data.xlsx')
    print("\nUpdated Excel File Content:")
    display(updated_extracted_df)  # display() is provided by Colab/IPython notebooks
except FileNotFoundError:
    print("\nExcel file not found after processing.")
