from abc import ABCMeta, abstractmethod
from typing import Dict, List, Optional, Tuple, TypeVar, Union
import numpy as np
import pandas as pd
from scipy import sparse as sps
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale
# Self type used by `fit` so that it returns the concrete subclass type.
R = TypeVar("R", bound="BaseRecommender")


class BaseRecommender(metaclass=ABCMeta):
    """Defines a common interface for all recommendation models.

    Args:
        df_interaction (pd.DataFrame): user/item interaction for train/test.
            Only the first two columns are used (user first, then item).
        items (Optional[pd.DataFrame]): side information for items.
        test_size (float|int): size of the held-out test split, forwarded to
            ``train_test_split``; ``0`` disables the split.
        random_split (bool): shuffle the interactions (with a fixed seed)
            before splitting, or keep the given row order.
        user_features (Optional[pd.DataFrame]): user feature columns.
        item_features (Optional[pd.DataFrame]): item feature columns.
        toppop_keep (Optional[np.ndarray]): indices of the items the
            top-popular ranking is restricted to; ``None`` keeps every item.
    """

    df_interaction: pd.DataFrame
    items: Optional[pd.DataFrame]
    test_size: Union[float, int]
    random_split: bool
    user_features: Optional[pd.DataFrame]
    item_features: Optional[pd.DataFrame]

    def __init__(
        self,
        df_interaction: pd.DataFrame,
        items: Optional[pd.DataFrame],
        test_size: Union[float, int],
        random_split: bool,
        user_features: Optional[pd.DataFrame] = None,
        item_features: Optional[pd.DataFrame] = None,
        toppop_keep: Optional[np.ndarray] = None,
    ) -> None:
        self.df_interaction, self.user_array, self.item_array = self.get_interaction(
            df_interaction
        )
        # Codes are dense (0..n-1), so the category counts are exactly the
        # matrix dimensions (same value as "max code + 1", but plain ints).
        self.n_users = len(self.user_array)
        self.n_items = len(self.item_array)
        self.items = items
        self.user_features = user_features
        self.item_features = item_features
        self.test_size = test_size
        self.random_split = random_split
        self.train_mat, self.test_mat = self.process_interaction()
        self.toppop = self._get_toppop(toppop_keep)

    def get_interaction(
        self, df_interaction: pd.DataFrame
    ) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
        """The converter for input dataframe.

        Args:
            df_interaction (pd.DataFrame): user/item interaction matrix.
                The first two columns are taken as ["userId", "itemId"].

        Returns:
            Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
                The two-column frame with both ids replaced by their integer
                category codes, the array of original user ids and the array
                of original item ids (as strings), each indexed by code.
        """
        # Work on a copy: the column writes below must neither touch the
        # caller's frame nor trigger pandas' "setting on a slice" warning.
        dataframe = df_interaction.iloc[:, :2].copy()
        dataframe.columns = pd.Index(["userId", "itemId"])
        # Item ids are always handled as strings from here on.
        dataframe["itemId"] = dataframe["itemId"].apply(str)
        user_cat = pd.Categorical(dataframe["userId"])
        item_cat = pd.Categorical(dataframe["itemId"])
        dataframe["userId"] = user_cat.codes
        dataframe["itemId"] = item_cat.codes
        user_array = np.array(list(user_cat.categories))
        item_array = np.array(list(item_cat.categories))
        return dataframe, user_array, item_array

    def _to_coo(self, frame: pd.DataFrame) -> sps.coo_matrix:
        """Build the (n_users, n_items) int32 matrix holding a 1 per row of `frame`."""
        return sps.coo_matrix(
            (np.ones(len(frame)), (frame["userId"], frame["itemId"])),
            shape=(self.n_users, self.n_items),
            dtype="int32",
        )

    def process_interaction(self) -> Tuple[sps.coo_matrix, Optional[sps.coo_matrix]]:
        """Split the interactions and turn them into sparse matrices.

        When `random_split` is set the rows are shuffled with a fixed seed and
        stored back on `self.df_interaction`. An "interaction" column of ones
        is added to the frame. The split itself is ordered (``shuffle=False``),
        so the test part is the tail of the frame.

        Returns:
            Tuple[sps.coo_matrix, Optional[sps.coo_matrix]]:
                The train matrix and the test matrix; the latter is `None`
                when `test_size` is 0.
        """
        dataframe = self.df_interaction
        if self.random_split:
            dataframe = dataframe.sample(frac=1, random_state=42).reset_index(drop=True)
            self.df_interaction = dataframe
        dataframe["interaction"] = 1

        if self.test_size != 0:
            train, test = train_test_split(
                dataframe, test_size=self.test_size, shuffle=False
            )
            test_mat: Optional[sps.coo_matrix] = self._to_coo(test)
        else:
            train = dataframe
            test_mat = None
        return self._to_coo(train), test_mat

    def clean_items(self) -> pd.DataFrame:
        """Attach item codes to the side information and return the clean rows.

        Adds an "encodes" column to `self.items` (in place) holding the code of
        each "itemId" as found in `item_array`, or a missing value when the id
        is unknown.

        Returns:
            pd.DataFrame: `self.items` sorted by "encodes" with every row that
            has a missing value in any column removed, and "encodes" as ints.
        """
        invmap = {v: k for k, v in enumerate(self.item_array)}
        self.items["encodes"] = self.items["itemId"].apply(
            lambda x: self._encode(x, invmap)
        )
        clean_items = self.items.sort_values(by=["encodes"]).dropna()
        clean_items["encodes"] = clean_items["encodes"].apply(int)
        return clean_items

    def _encode(self, id: int, invmap: Dict) -> Optional[int]:
        """Look `id` up in `invmap`; return its code as int, or `None` if absent."""
        try:
            return int(invmap[id])
        except (KeyError, TypeError, ValueError):
            # Unknown id, unhashable id, or a mapped value that is not an int.
            return None

    def fit(self: R) -> R:
        """Run the model-specific `_fit` and return `self` for chaining."""
        self._fit()
        return self

    @abstractmethod
    def _fit(self) -> None:
        """Train the model; must be provided by subclasses."""
        raise NotImplementedError("_fit must be implemented.")

    @abstractmethod
    def predict(
        self,
        user_ids: np.ndarray,
        item_ids: np.ndarray,
        user_features: Optional[sps.csr_matrix],
        item_features: Optional[sps.csr_matrix],
    ) -> np.ndarray:
        """Return scores for the given users and items; provided by subclasses.

        `predict_for_userId` calls this with `item_ids = arange(n_items)` and
        indexes the result by item code, so the result is expected to hold one
        score per position of `item_ids`.
        """
        raise NotImplementedError("predict must be implemented.")

    @staticmethod
    def _normalize_scores(
        scores: np.ndarray, feature_range: Tuple[int, int] = (0, 1)
    ) -> np.ndarray:
        """Get the normalized scores.

        Args:
            scores (np.ndarray): Scores in original scale.
            feature_range (Tuple[int, int], optional): Defaults to (0, 1).

        Returns:
            np.ndarray: Normalized scores (a new array, min-max scaled).
        """
        return minmax_scale(scores, feature_range=feature_range, axis=0, copy=True)

    def _get_toppop(
        self, toppop_keep: Optional[np.ndarray]
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Get the top popular indices and scores for items.

        Popularity is the per-item column sum of the train matrix.

        Args:
            toppop_keep (Optional[np.ndarray]): The indices of items to be kept.

        Returns:
            Tuple[np.ndarray, np.ndarray]:
                Top popular indices and the normalized scores of items, most
                popular first. Return all top popular if `toppop_keep` is not
                assigned; otherwise only `len(toppop_keep)` entries.
        """
        scores = np.asarray(self.train_mat.sum(axis=0)).ravel()
        if toppop_keep is None:
            rank = (-scores).argsort()
            scores = self._normalize_scores(scores)
            return (rank, scores[rank])

        # Zero out every item that is not explicitly kept before ranking.
        dropped = np.ones(scores.shape[0], dtype=bool)
        dropped[toppop_keep] = False
        scores[dropped] = 0
        scores = self._normalize_scores(scores)
        rank = (-scores).argsort()[: len(toppop_keep)]
        return (rank, scores[rank])

    def predict_for_userId(self, user_id: int) -> np.ndarray:
        """Return the scores of all `n_items` items for one encoded user."""
        user_ids: np.ndarray = np.full(self.n_items, user_id)
        item_ids: np.ndarray = np.arange(self.n_items)
        return self.predict(user_ids, item_ids, self.user_features, self.item_features)

    def predict_for_userId_unseen(self, user_id: int) -> np.ndarray:
        """Return the scores for one user with already-seen items set to -inf.

        Seen items are all items the user has a row for in `df_interaction`.
        The scores are returned as a fresh floating-point array, so whatever
        `predict` returned is left untouched.
        """
        seen = self.df_interaction.loc[
            self.df_interaction["userId"] == user_id, "itemId"
        ].to_numpy()
        # Copy first: `predict` may hand out a buffer the model still owns.
        prediction = np.array(self.predict_for_userId(user_id), copy=True)
        if not np.issubdtype(prediction.dtype, np.floating):
            # -inf cannot be stored in an integer array.
            prediction = prediction.astype(float)
        prediction[seen] = -np.inf
        return prediction

    def predict_top_n_unseen(self, user_id: int, top_n: int) -> Dict[int, float]:
        """Return the `top_n` best-scored unseen items of a user.

        Args:
            user_id (int): Encoded user id.
            top_n (int): Number of items wanted; values above the number of
                items are clamped, values <= 0 give an empty result.

        Returns:
            Dict[int, float]: Item code -> score, in no particular order.
        """
        prediction = self.predict_for_userId_unseen(user_id)
        n_scores = prediction.shape[0]
        top_n = min(top_n, n_scores)
        if top_n <= 0:
            return {}
        if top_n == n_scores:
            # argpartition rejects kth == len, and everything is selected anyway.
            result_args = np.arange(n_scores)
        else:
            result_args = np.argpartition(-prediction, top_n)[:top_n]
        return {key: prediction[key] for key in result_args}

    def predict_top_n_item(self, user_id: int, top_n: int) -> pd.DataFrame:
        """Return the top unseen items of a user as a frame, best score first.

        The frame has the columns "itemId" and "scores"; when `items` side
        information is available it is left-merged on "itemId".
        """
        prediction = self.predict_top_n_unseen(user_id, top_n)
        # Naming the columns up front keeps an empty prediction from failing.
        candidates: pd.DataFrame = pd.DataFrame(
            list(prediction.items()), columns=pd.Index(["itemId", "scores"])
        )
        candidates = candidates.sort_values(
            by="scores", ascending=False, ignore_index=True
        )
        if self.items is None:
            return candidates
        return candidates.merge(self.items, how="left", on="itemId")

    def get_user_id(self, user_string: str) -> Optional[int]:
        """Get the `user_id` for a given `user_string`.

        Args:
            user_string (str): Original user string.

        Returns:
            Optional[int]:
                The index of the user in `user_array`.
                Return `None` if not in training set.
        """
        matches = np.flatnonzero(self.user_array == user_string)
        return int(matches[0]) if matches.size else None

    def get_item_id(self, item_string: str) -> Optional[int]:
        """Get the `item_id` for a given `item_string`.

        Args:
            item_string (str): Original item string. Other values are
                converted with `str`, as item ids are stored as strings.

        Returns:
            Optional[int]:
                The index of the item in `item_array`.
                Return `None` if not in training set.
        """
        matches = np.flatnonzero(self.item_array == str(item_string))
        return int(matches[0]) if matches.size else None

    def get_topk_indices(self, scores: np.ndarray, top_k: int) -> np.ndarray:
        """Get the indices correspond to the topk items.

        Args:
            scores (np.ndarray): Scores given by the models.
            top_k (int): Numbers of top items to be kept; clamped to the
                number of scores, and values <= 0 give an empty result.

        Returns:
            np.ndarray: Indices for `top_k` items, highest score first.
        """
        scores = np.asarray(scores)
        top_k = min(top_k, scores.shape[0])
        if top_k <= 0:
            # Guard needed: a slice of [-0:] would select every index.
            return np.empty(0, dtype=np.intp)
        indices = np.argpartition(scores, -top_k)[-top_k:]
        sorted_indices = indices[np.argsort(scores[indices])[::-1]]
        return np.asarray(sorted_indices)