# Speech Emotion Recognition (SER) Project - Step-by-Step Code
#   STEP 1: Project Setup + Objective
#   - Setup folders: data/, models/, utils/, notebooks/
#   - Create requirements.txt with basic libraries
#   - Create README.md
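#   A requirements.txt consistent with the imports used below might list (a sketch,
#   versions deliberately left unpinned; pin them for reproducibility):
#     numpy, pandas, scipy, scikit-learn, matplotlib, seaborn, tensorflow, tqdm, streamlit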
# STEP 2: Dataset Collection & Cleaning
import os
import shutil
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.io import wavfile
from scipy.signal import resample
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout
def create_data_folder():
    os.makedirs('data/RAVDESS', exist_ok=True)
    os.makedirs('data/TESS', exist_ok=True)
    os.makedirs('data/CREMA-D', exist_ok=True)
    print("Folders for datasets created.")
create_data_folder()
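# Quick sanity check (the corpora themselves must be downloaded and extracted into
# the folders above by hand; this script does not fetch them): count the .wav files
# found per corpus before spending time on feature extraction.
def count_wav_files(dataset_path):
    return sum(1 for _, _, files in os.walk(dataset_path)
               for f in files if f.lower().endswith('.wav'))

for name in ('RAVDESS', 'TESS', 'CREMA-D'):
    print(f"{name}: {count_wav_files(os.path.join('data', name))} .wav files found")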
# STEP 3: Audio Preprocessing + Feature Extraction (crude frame-mean features used in place of MFCCs)
SAMPLE_RATE = 22050
MAX_LEN = 5  # seconds

def extract_features(file_path):
    """Load a wav file, resample to SAMPLE_RATE, fix its length to MAX_LEN seconds,
    and return a crude frame-mean feature vector (a stand-in for MFCCs)."""
    try:
        sr, audio = wavfile.read(file_path)
        audio = audio.astype(np.float32)
        if audio.ndim > 1:
            audio = audio[:, 0]  # convert to mono (keep the first channel)
        # Resample from the file's native rate to SAMPLE_RATE
        if sr != SAMPLE_RATE:
            audio = resample(audio, int(len(audio) * SAMPLE_RATE / sr))
        # Trim or zero-pad to exactly MAX_LEN seconds
        desired_len = SAMPLE_RATE * MAX_LEN
        if len(audio) > desired_len:
            audio = audio[:desired_len]
        elif len(audio) < desired_len:
            audio = np.pad(audio, (0, desired_len - len(audio)), 'constant')
        # Crude feature: mean of each 100-sample block; drop the remainder so the
        # reshape is valid (110250 samples -> 1102 blocks)
        usable = (desired_len // 100) * 100
        return np.mean(audio[:usable].reshape(-1, 100), axis=1)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None
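# Hedged alternative: MFCCs are the conventional SER feature. A minimal sketch,
# assuming librosa is added to requirements.txt (it is not imported above); the
# function name and n_mfcc value are choices made here, not part of the project.
def extract_mfcc_features(file_path, n_mfcc=40):
    import librosa  # local import so the rest of the script runs without librosa
    y, _ = librosa.load(file_path, sr=SAMPLE_RATE, duration=MAX_LEN)
    y = librosa.util.fix_length(y, size=SAMPLE_RATE * MAX_LEN)
    mfcc = librosa.feature.mfcc(y=y, sr=SAMPLE_RATE, n_mfcc=n_mfcc)
    return np.mean(mfcc, axis=1)  # one averaged value per MFCC coefficient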
def extract_emotion_label(filename, emotion_map):
    """Parse the emotion code from a RAVDESS filename (third '-'-separated field)."""
    try:
        emotion_code = int(filename.split('-')[2])
        return emotion_map.get(emotion_code)
    except (IndexError, ValueError):
        return None
def process_dataset(dataset_path, emotion_map):
    data = []
    for root, _, files in os.walk(dataset_path):
        for file in tqdm(files):
            if file.endswith('.wav'):
                path = os.path.join(root, file)
                emotion = extract_emotion_label(file, emotion_map)
                if emotion:
                    features = extract_features(path)
                    if features is not None:
                        data.append([features, emotion])
    return pd.DataFrame(data, columns=['features', 'label'])
# Emotion map for RAVDESS
ravdess_emotion_map = {
    1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad',
    5: 'angry', 6: 'fearful', 7: 'disgust', 8: 'surprised'
}
# df_ravdess = process_dataset('data/RAVDESS', ravdess_emotion_map)
# df_ravdess.to_pickle('data/features_ravdess.pkl')
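# Hedged sketch for the other two corpora: TESS and CREMA-D encode the emotion as
# text in the filename rather than a numeric code, so the RAVDESS parser above does
# not apply. The parsers below assume the commonly distributed naming schemes
# (TESS "OAF_back_angry.wav", CREMA-D "1001_DFA_ANG_XX.wav"); verify them against
# your local copies, then swap them in for extract_emotion_label inside
# process_dataset to build features_tess.pkl and features_crema.pkl.
tess_label_map = {'ps': 'surprised', 'fear': 'fearful'}  # align names with RAVDESS labels
def extract_tess_label(filename):
    token = os.path.splitext(filename)[0].split('_')[-1].lower()
    return tess_label_map.get(token, token)

crema_emotion_map = {'ANG': 'angry', 'DIS': 'disgust', 'FEA': 'fearful',
                     'HAP': 'happy', 'NEU': 'neutral', 'SAD': 'sad'}
def extract_crema_label(filename):
    parts = filename.split('_')
    return crema_emotion_map.get(parts[2]) if len(parts) > 2 else None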
# STEP 4: Label Encoding + Data Split
def load_all_data():
    df_ravdess = pd.read_pickle('data/features_ravdess.pkl')
    df_tess = pd.read_pickle('data/features_tess.pkl')
    df_crema = pd.read_pickle('data/features_crema.pkl')
    df_all = pd.concat([df_ravdess, df_tess, df_crema], ignore_index=True)
    return df_all
data_df = load_all_data()
X = np.array(data_df['features'].tolist())
label_encoder = LabelEncoder()  # keep the fitted encoder so class names can be recovered later
y = label_encoder.fit_transform(data_df['label'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)  # stratify keeps the emotion mix similar in both splits
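# A small persistence step (a sketch; the file names are choices made here, not
# fixed by the project): save the fitted LabelEncoder so the demo app in Step 7
# can map predicted class indices back to emotion names.
import joblib
os.makedirs('models', exist_ok=True)
joblib.dump(label_encoder, 'models/label_encoder.joblib')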
# STEP 5: Model Building - CNN + LSTM
X_train_reshaped = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test_reshaped = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
model = Sequential()
model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))
model.add(LSTM(128))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(len(np.unique(y)), activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
model.fit(X_train_reshaped, y_train, epochs=50, batch_size=32, validation_split=0.1)
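# Optional training refinements (a sketch, not used in the fit call above): early
# stopping and a best-weights checkpoint usually help over 50 epochs on a small
# corpus. The checkpoint path "models/ser_best.keras" is an assumed name; pass the
# list via callbacks=[...] when calling model.fit.
# callbacks = [
#     tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5,
#                                      restore_best_weights=True),
#     tf.keras.callbacks.ModelCheckpoint('models/ser_best.keras',
#                                        save_best_only=True),
# ]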
# STEP 6: Evaluation
y_pred = np.argmax(model.predict(X_test_reshaped), axis=1)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d',
            xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()
# STEP 7: App Demo (Streamlit/Gradio)
# Note: You'd use scipy.io.wavfile + resample here as well
# Save model: model.save("ser_model.h5")
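# A hedged sketch of an offline inference helper (it could live in utils/): it
# reuses extract_features so a single wav goes through exactly the same
# preprocessing as the training data. The file names ("ser_model.h5",
# "models/label_encoder.joblib") are the ones assumed in the notes above and in
# Step 4, not project-mandated names.
# import joblib
# from tensorflow.keras.models import load_model
# def predict_emotion(wav_path, model_path="ser_model.h5",
#                     encoder_path="models/label_encoder.joblib"):
#     ser_model = load_model(model_path)
#     encoder = joblib.load(encoder_path)
#     feats = extract_features(wav_path)
#     if feats is None:
#         return None
#     probs = ser_model.predict(feats.reshape(1, -1, 1))
#     return encoder.classes_[int(np.argmax(probs))]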
# streamlit_app.py
'''
import streamlit as st
import numpy as np
from scipy.io import wavfile
from scipy.signal import resample
from tensorflow.keras.models import load_model
import joblib
model = load_model("ser_model.h5")
label_encoder = joblib.load("models/label_encoder.joblib")  # encoder saved in Step 4
st.title("Speech Emotion Recognizer")
uploaded_file = st.file_uploader("Upload an audio file", type=["wav"])
if uploaded_file:
    sr, audio = wavfile.read(uploaded_file)
    audio = audio.astype(np.float32)
    if audio.ndim > 1:
        audio = audio[:, 0]
    # Mirror the training pipeline: resample to 22050 Hz, fix length to 5 s,
    # then average 100-sample frames
    if sr != 22050:
        audio = resample(audio, int(len(audio) * 22050 / sr))
    desired_len = 22050 * 5
    audio = audio[:desired_len]
    audio = np.pad(audio, (0, desired_len - len(audio)), 'constant')
    usable = (desired_len // 100) * 100
    features = np.mean(audio[:usable].reshape(-1, 100), axis=1).reshape(1, -1, 1)
    prediction = model.predict(features)
    emotion = label_encoder.inverse_transform([np.argmax(prediction)])[0]
    st.write(f"Predicted Emotion: {emotion}")
'''
# Run using: streamlit run streamlit_app.py
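# gradio_app.py - a hedged alternative sketch for the Gradio option mentioned in
# Step 7. Gradio is an extra dependency (pip install gradio) not listed above; the
# model/encoder file names match the assumptions used earlier, and the app expects
# a .wav upload since scipy's wavfile reader only handles WAV.
'''
import gradio as gr
import numpy as np
import joblib
from scipy.io import wavfile
from scipy.signal import resample
from tensorflow.keras.models import load_model

model = load_model("ser_model.h5")
label_encoder = joblib.load("models/label_encoder.joblib")

def predict(path):
    sr, audio = wavfile.read(path)
    audio = audio.astype(np.float32)
    if audio.ndim > 1:
        audio = audio[:, 0]
    # Same preprocessing as streamlit_app.py: 22050 Hz, 5 s, 100-sample frame means
    if sr != 22050:
        audio = resample(audio, int(len(audio) * 22050 / sr))
    desired_len = 22050 * 5
    audio = audio[:desired_len]
    audio = np.pad(audio, (0, desired_len - len(audio)), 'constant')
    usable = (desired_len // 100) * 100
    features = np.mean(audio[:usable].reshape(-1, 100), axis=1).reshape(1, -1, 1)
    probs = model.predict(features)
    return label_encoder.classes_[int(np.argmax(probs))]

gr.Interface(fn=predict, inputs=gr.Audio(type="filepath"), outputs="text",
             title="Speech Emotion Recognizer").launch()
'''
# Run using: python gradio_app.py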