Skip to content

PapaGoose/RS

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

14 Commits
 
 
 
 
 
 

Repository files navigation

Сперва импортируем необходимые библеотеки:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import winsound

import scipy.sparse as sparse

from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
from lightfm.data import Dataset

import sklearn
from sklearn.model_selection import train_test_split

Далее загружаем необходимые нам датасеты:

data = pd.read_csv('data/train.csv', low_memory=False)
test = pd.read_csv('data/test.csv', low_memory=False)
submission = pd.read_csv('data/sample_submission.csv')

Используем метод grid search для нахождения наилучших параметров для нашей модели:

def sample_hyperparameters():

    while True:
        yield {
            "no_components": np.random.randint(60, 120),
            "learning_schedule": np.random.choice(["adagrad", "adadelta"]),
            "learning_rate": np.random.exponential(0.05),
            "item_alpha": np.random.exponential(1e-8),
            "user_alpha": np.random.exponential(1e-8),
            "num_epochs": np.random.randint(5, 40),
        }


def random_search(train, test, num_samples=10, num_threads=1):
    """
    Sample random hyperparameters, fit a LightFM model, and evaluate it
    on the test set.

    Parameters
    ----------

    train: np.float32 coo_matrix of shape [n_users, n_items]
        Training data.
    test: np.float32 coo_matrix of shape [n_users, n_items]
        Test data.
    num_samples: int, optional
        Number of hyperparameter choices to evaluate.


    Returns
    -------

    generator of (auc_score, hyperparameter dict, fitted model)

    """

    for hyperparams in itertools.islice(sample_hyperparameters(), num_samples):
        num_epochs = hyperparams.pop("num_epochs")

        model = LightFM(**hyperparams)
        model.fit(train, epochs=num_epochs, num_threads=num_threads)
        preds = model.predict(test.userid.values,
                      test.itemid.values)
        score = sklearn.metrics.roc_auc_score(test.rating,preds)

        hyperparams["num_epochs"] = num_epochs

        yield (score, hyperparams, model)


if __name__ == "__main__":
    data = pd.read_csv('data/train.csv', low_memory=False)
    test = pd.read_csv('data/test.csv', low_memory=False)
    
    train_data, test_data = train_test_split(data,random_state=32, shuffle=True)
    #создаем разреженную матрицу user/item
    ratings_coo = sparse.coo_matrix((train_data['rating'].astype(int),
                                 (train_data['userid'],
                                  train_data['itemid'])))
                                  
    (score, hyperparams, model) = max(random_search(ratings_coo, test_data, num_threads=4, num_samples=200), key=lambda x: x[0])
    
    print("Best score {} at {}".format(score, hyperparams))

Best score 0.7540887486124561 at {'no_components': 106, 'learning_schedule': 'adagrad', 'learning_rate': 0.15345874636828602, 'item_alpha': 1.8816625826291405e-08, 'user_alpha': 1.1161645409516314e-08, 'num_epochs': 13}

После этого обучаем модель на этих параметрах и делаем предсказание:

model = LightFM(**params_new,random_state=111)
model.fit(ratings_coo, epochs=13, num_threads=4)

preds = model.predict(test.userid.values,
                      test.itemid.values)
normalized_preds = (preds - preds.min())/(preds - preds.min()).max()
submission = pd.read_csv('data/sample_submission.csv')
submission['rating'] = normalized_preds
submission.to_csv('data/new_sub.csv',index=False)

ROC AUC = 0.77466

About

No description, website, or topics provided.

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published