import pandas as pd
import re
import os
import google.generativeai as genai
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from chromadb import Client
# from dotenv import load_dotenv # No longer needed if using Colab secrets
from google.colab import userdata
# Load environment variables (using Colab secrets now)
API_KEY = userdata.get("GOOGLE_API_KEY")  # returns None if the secret is not set
if not API_KEY:
    raise ValueError("GOOGLE_API_KEY not found. Please set it in Colab secrets.")
genai.configure(api_key=API_KEY)
# Use the working model
model = genai.GenerativeModel('gemini-1.5-flash-latest')
# Initialize components (assuming these are already initialized from previous steps)
# Initialize the text splitter
# NOTE(review): 100-char chunks with 20-char overlap are quite small — presumably
# tuned for short emails; confirm against the indexing step that filled the DB.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)
# Initialize the embedding model
# Must match the model used when the "email_chunks" collection was populated,
# otherwise query embeddings live in a different vector space.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Initialize an in-memory ChromaDB client and get the collection
# In-memory client: the collection is empty unless earlier cells added chunks.
client = Client()
collection = client.get_or_create_collection(name="email_chunks")
# Define cleaning, extraction, and append functions (re-defining for clarity in this block)
def clean_email_content(email_content):
  """
  Cleans and formats an email body for downstream processing.

  Strips HTML tags, removes every character outside a small allowed set
  (alphanumerics, whitespace, and basic punctuation), and collapses runs
  of whitespace into single spaces.

  Args:
    email_content: Raw email body, possibly containing HTML.

  Returns:
    The cleaned, single-spaced text with no leading/trailing whitespace.
  """
  without_tags = re.sub(r'<.*?>', '', email_content)
  allowed_chars_only = re.sub(r'[^a-zA-Z0-9\s.,!?;:]', '', without_tags)
  return re.sub(r'\s+', ' ', allowed_chars_only).strip()
def extract_data_for_excel(email_content):
  """
  Extracts a content summary and contact number from cleaned email text.

  (Simplified for this example: 'Sender' and 'Subject' default to 'N/A'
  and are filled in later by the caller from the email envelope.)

  Args:
    email_content: Cleaned plain-text email body.

  Returns:
    dict with keys 'Sender', 'Subject', 'Summary', 'Contact'.
  """
  # BUG FIX: the original dict literal was never closed (missing '}'),
  # and the Summary conditional expression was split mid-line — both
  # were syntax errors.
  extracted_data = {
      'Sender': 'N/A',
      'Subject': 'N/A',
      'Summary': 'N/A',
      'Contact': 'N/A',
  }
  # Keep only a 100-character preview of long bodies.
  extracted_data['Summary'] = (
      email_content[:100] + '...' if len(email_content) > 100 else email_content
  )
  # Heuristic: the first run of 7+ digits/spaces/dashes is treated as a
  # phone number (surrounding whitespace stripped).
  contact_match = re.search(r'[\d\s-]{7,}', email_content)
  if contact_match:
    extracted_data['Contact'] = contact_match.group(0).strip()
  return extracted_data
def append_to_excel(data_dict, file_path):
  """
  Appends one row of extracted email data to an Excel file.

  Creates the file (with a header row) on first use; on later calls the
  row is appended below the last populated row without repeating the
  header. Parent directories are created if the path has any.

  Args:
    data_dict: Mapping of column name -> value for a single row.
    file_path: Destination .xlsx path. May be a bare filename.
  """
  # BUG FIX: os.path.dirname() returns '' for a bare filename and
  # os.makedirs('') raises FileNotFoundError — only create directories
  # when the path actually contains one.
  parent_dir = os.path.dirname(file_path)
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)
  df = pd.DataFrame([data_dict])
  if not os.path.exists(file_path):
    df.to_excel(file_path, index=False)
  else:
    # 'overlay' lets us write into the existing sheet; start below the
    # last used row so rows accumulate instead of overwriting.
    with pd.ExcelWriter(file_path, mode='a', engine='openpyxl', if_sheet_exists='overlay') as writer:
      df.to_excel(writer, index=False, header=False, startrow=writer.book.active.max_row)
def search_email_chunks(query_string, n_results=3):
  """
  Searches the ChromaDB collection for email chunks relevant to a query.

  Embeds the query with the module-level SentenceTransformer and runs a
  nearest-neighbour query against the "email_chunks" collection.

  Args:
    query_string: Free-text query.
    n_results: Maximum number of chunks to retrieve.

  Returns:
    The ChromaDB query result dict (documents, distances, metadatas).
  """
  query_vector = embedding_model.encode(query_string).tolist()
  return collection.query(
      query_embeddings=[query_vector],
      n_results=n_results,
      include=['documents', 'distances', 'metadatas'],
  )
def generate_reply(original_email_content, retrieved_chunks):
  """
  Generates a suitable email reply using the original email and retrieved context.

  Builds a prompt from the original email plus the retrieved context
  chunks and sends it to the Gemini model.

  Args:
    original_email_content: Cleaned text of the email being answered.
    retrieved_chunks: Iterable of context strings from the vector store.

  Returns:
    The generated reply text, or a fallback message if generation fails.
  """
  prompt = f"""Original Email:
{original_email_content}
Relevant Context from past emails:
"""
  for i, chunk in enumerate(retrieved_chunks):
      prompt += f"Chunk {i+1}: {chunk}\n"
  # BUG FIX: this string literal was split across two lines in the
  # original (a syntax error); rejoined here via implicit concatenation.
  prompt += ("\nBased on the original email and the relevant context, "
             "generate a concise and helpful reply.")
  try:
      response = model.generate_content(prompt)
      return response.text
  except Exception as e:
      # Best-effort: log the failure and keep the pipeline running.
      print(f"Error generating content: {e}")
      return "Could not generate a reply at this time."
# Integrated function to process a new email
def process_new_email(new_email_content_raw, sender, subject,
                      excel_file_path='extracted_email_data.xlsx'):
  """
  Simulates receiving and processing a new email through the RAG workflow.

  Pipeline: clean -> retrieve context -> generate reply -> extract fields
  and append them to an Excel log.

  Args:
    new_email_content_raw: Raw (possibly HTML) email body.
    sender: Sender name/address for the Excel log.
    subject: Email subject for the Excel log.
    excel_file_path: Destination .xlsx path for extracted data.

  Returns:
    The generated reply text, or an error message if any step fails.
  """
  # BUG FIX: the original body had lost its indentation (statements at
  # column 0 under the def) and the signature was split mid-token.
  print(f"Processing new email from {sender} with subject: {subject}")
  try:
    # 1. Clean the email content
    cleaned_content = clean_email_content(new_email_content_raw)
    print("Email content cleaned.")
    # 2. Chunking of the new email is skipped here — only retrieval is
    # needed for answering; chunk/add to the DB separately if desired.
    # 3. Retrieve relevant context from the vector database
    retrieval_results = search_email_chunks(cleaned_content)
    # Flatten the per-query list-of-lists that ChromaDB returns.
    retrieved_chunks = [doc for sublist in retrieval_results.get('documents', []) for doc in sublist]
    print(f"Retrieved {len(retrieved_chunks)} relevant chunks.")
    # 4. Generate a reply using the language model
    generated_reply = generate_reply(cleaned_content, retrieved_chunks)
    print("\nGenerated Reply:")
    print(generated_reply)
    # 5. Extract data for Excel and append
    extracted_data = extract_data_for_excel(cleaned_content)
    extracted_data['Sender'] = sender  # Add sender from input
    extracted_data['Subject'] = subject  # Add subject from input
    append_to_excel(extracted_data, excel_file_path)
    print(f"\nExtracted data appended to {excel_file_path}.")
    return generated_reply
  except Exception as e:
    # Boundary handler: report the failure and return a sentinel reply so
    # the calling script keeps running.
    print(f"\nAn error occurred during email processing: {e}")
    return "An error occurred during processing."
# 3. Create a sample "new" email string to exercise the pipeline end-to-end.
sample_new_email_content = """
Hello team,
Just following up on the project update meeting. Could someone confirm the deadline for phase 2
implementation?
Also, please let me know if you need anything from my side. You can reach me at 555-1234.
Thanks,
Alice
"""
sample_sender = "Alice"
sample_subject = "Follow up on Project Update"

# 4. Call the integrated function with the sample new email
print("--- Starting New Email Processing ---")
process_new_email(sample_new_email_content, sample_sender, sample_subject)
print("--- New Email Processing Finished ---")

# 5. Optionally, read and display the updated Excel file
try:
  updated_extracted_df = pd.read_excel('extracted_email_data.xlsx')
  print("\nUpdated Excel File Content:")
  try:
    # display() only exists inside notebooks (IPython builtin); fall back
    # to print() when this runs as a plain script.
    display(updated_extracted_df)
  except NameError:
    print(updated_extracted_df)
except FileNotFoundError:
  print("\nExcel file not found after processing.")