
# Speech Emotion Recognition (SER) Project - Step-by-Step Code

# STEP 1: Project Setup + Objective


# - Setup folders: data/, models/, utils/, notebooks/
# - Create requirements.txt with basic libraries
# - Create README.md
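# A minimal sketch of the scaffold described above, assuming it is run from the
# project root. Folder names come from the comments; the package list written
# to requirements.txt is an assumption (it mirrors the imports used later).
import os

def create_project_scaffold():
    for folder in ['data', 'models', 'utils', 'notebooks']:
        os.makedirs(folder, exist_ok=True)
    with open('requirements.txt', 'w') as f:
        f.write('\n'.join(['numpy', 'pandas', 'scipy', 'scikit-learn',
                           'matplotlib', 'seaborn', 'tensorflow', 'tqdm',
                           'streamlit']) + '\n')
    with open('README.md', 'w') as f:
        f.write('# Speech Emotion Recognition (SER)\n')

# create_project_scaffold()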

# STEP 2: Dataset Collection & Cleaning


import os
import shutil
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.io import wavfile
from scipy.signal import resample
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout

def create_data_folder():
    os.makedirs('data/RAVDESS', exist_ok=True)
    os.makedirs('data/TESS', exist_ok=True)
    os.makedirs('data/CREMA-D', exist_ok=True)
    print("Folders for datasets created.")

create_data_folder()
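# A small sanity check for the "cleaning" half of this step: count the .wav
# files under each dataset folder so missing or misplaced downloads show up
# early. Folder names match create_data_folder() above; this is only a sketch.
def summarize_datasets(base='data'):
    for name in ['RAVDESS', 'TESS', 'CREMA-D']:
        folder = os.path.join(base, name)
        wav_count = sum(f.endswith('.wav')
                        for _, _, files in os.walk(folder) for f in files)
        print(f"{name}: {wav_count} .wav files")

# summarize_datasets()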

# STEP 3: Audio Preprocessing + Feature Extraction (MFCC replacement)


SAMPLE_RATE = 22050
MAX_LEN = 5 # seconds

def extract_features(file_path):
    try:
        sr, audio = wavfile.read(file_path)
        audio = audio.astype(np.float32)
        if audio.ndim > 1:
            audio = audio[:, 0]  # Convert to mono
        # Pad or truncate to MAX_LEN seconds at the file's own sample rate
        desired_len = sr * MAX_LEN
        if len(audio) > desired_len:
            audio = audio[:desired_len]
        elif len(audio) < desired_len:
            audio = np.pad(audio, (0, desired_len - len(audio)), 'constant')
        # Resample so every clip ends up with the same number of samples
        audio_resampled = resample(audio, SAMPLE_RATE * MAX_LEN)
        # Crude feature vector: mean of consecutive 100-sample frames
        # (trim so the length divides evenly by the frame size)
        usable = (len(audio_resampled) // 100) * 100
        return np.mean(audio_resampled[:usable].reshape(-1, 100), axis=1)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

def extract_emotion_label(filename, emotion_map):
    try:
        # RAVDESS-style filenames encode the emotion as the third dash-separated field
        emotion_code = int(filename.split('-')[2])
        return emotion_map.get(emotion_code)
    except (IndexError, ValueError):
        return None

def process_dataset(dataset_path, emotion_map):
    data = []
    for root, _, files in os.walk(dataset_path):
        for file in tqdm(files):
            if file.endswith('.wav'):
                path = os.path.join(root, file)
                emotion = extract_emotion_label(file, emotion_map)
                if emotion:
                    features = extract_features(path)
                    if features is not None:
                        data.append([features, emotion])
    return pd.DataFrame(data, columns=['features', 'label'])

# Emotion map for RAVDESS
ravdess_emotion_map = {
    1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad',
    5: 'angry', 6: 'fearful', 7: 'disgust', 8: 'surprised'
}
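# Quick illustration of the mapping: in a RAVDESS filename such as
# "03-01-05-01-02-01-12.wav" (hypothetical example) the third dash-separated
# field is the emotion code, so code 5 maps to 'angry'.
# extract_emotion_label('03-01-05-01-02-01-12.wav', ravdess_emotion_map)  # -> 'angry'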

# df_ravdess = process_dataset('data/RAVDESS', ravdess_emotion_map)
# df_ravdess.to_pickle('data/features_ravdess.pkl')
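# The feature above is a crude stand-in for MFCCs (hence "MFCC replacement").
# If librosa is available -- an extra dependency, not imported above -- a real
# MFCC extractor could be dropped in with the same return-shape convention.
# Sketch only:
#
# import librosa
#
# def extract_mfcc(file_path, n_mfcc=40):
#     audio, sr = librosa.load(file_path, sr=SAMPLE_RATE, duration=MAX_LEN)
#     mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
#     return np.mean(mfcc, axis=1)  # one mean value per MFCC coefficient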

# STEP 4: Label Encoding + Data Split

def load_all_data():
    df_ravdess = pd.read_pickle('data/features_ravdess.pkl')
    df_tess = pd.read_pickle('data/features_tess.pkl')
    df_crema = pd.read_pickle('data/features_crema.pkl')
    df_all = pd.concat([df_ravdess, df_tess, df_crema], ignore_index=True)
    return df_all

data_df = load_all_data()
X = np.array(data_df['features'].tolist())
le = LabelEncoder()  # keep the fitted encoder so class indices map back to emotion names
y = le.fit_transform(data_df['label'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)
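# The Step 7 demo only sees an integer class index, so it helps to persist the
# label names alongside the model. A sketch -- the file path is an assumption:
# np.save('models/label_classes.npy', le.classes_)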

# STEP 5: Model Building - CNN + LSTM


X_train_reshaped = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test_reshaped = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

model = Sequential()
model.add(Conv1D(64, kernel_size=3, activation='relu',
input_shape=(X_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))
model.add(LSTM(128))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(len(np.unique(y)), activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',
metrics=['accuracy'])
model.summary()
model.fit(X_train_reshaped, y_train, epochs=50, batch_size=32,
validation_split=0.1)
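# Optional guard against overfitting with the fixed 50-epoch budget above: an
# EarlyStopping callback, shown as a sketch (the patience value is an assumption).
# early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5,
#                                               restore_best_weights=True)
# model.fit(X_train_reshaped, y_train, epochs=50, batch_size=32,
#           validation_split=0.1, callbacks=[early_stop])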

# STEP 6: Evaluation
y_pred = np.argmax(model.predict(X_test_reshaped), axis=1)
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(le.classes_)))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=le.classes_,
            yticklabels=le.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()
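# Per-class recall (diagonal of the confusion matrix over each row sum) as a
# quick complement to the classification report above.
per_class_recall = cm.diagonal() / cm.sum(axis=1)
for name, recall in zip(le.classes_, per_class_recall):
    print(f"{name}: {recall:.2f}")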

# STEP 7: App Demo (Streamlit/Gradio)


# Note: You'd use scipy.io.wavfile + resample here as well
# Save model: model.save("ser_model.h5")

# streamlit_app.py
'''
import streamlit as st
import numpy as np
from scipy.io import wavfile
from scipy.signal import resample
from tensorflow.keras.models import load_model

model = load_model("ser_model.h5")

st.title("Speech Emotion Recognizer")


uploaded_file = st.file_uploader("Upload an audio file", type=[".wav"])

if uploaded_file:
    sr, audio = wavfile.read(uploaded_file)
    audio = audio.astype(np.float32)
    if audio.ndim > 1:
        audio = audio[:, 0]
    # Mirror the training preprocessing: 5 s at the file's own rate, resample
    # to 22050 * 5 samples, then average consecutive 100-sample frames
    desired_len = sr * 5
    audio = np.pad(audio, (0, max(0, desired_len - len(audio))), 'constant')[:desired_len]
    audio = resample(audio, 22050 * 5)
    usable = (len(audio) // 100) * 100
    features = np.mean(audio[:usable].reshape(-1, 100), axis=1).reshape(1, -1, 1)
    prediction = model.predict(features)
    emotion = np.argmax(prediction)  # class index; map back with the LabelEncoder fitted in training
    st.write(f"Predicted Emotion: {emotion}")
'''
# Run using: streamlit run streamlit_app.py
