Source code for rsdiv.dataset.movielens_100k

import os
from typing import List

import pandas as pd

from .base import BaseDownloader


class MovieLens100KDownLoader(BaseDownloader):
    """MovieLens dataset downLoader for 100K interactions.

    The ``read_*`` methods parse the raw ``u.*`` files found under
    ``DEFAULT_PATH`` into pandas DataFrames; they do not fetch anything
    themselves.

    NOTE(review): presumably ``BaseDownloader`` retrieves and unpacks
    ``DOWNLOAD_URL`` into that directory -- confirm against the base class.
    """

    # URL of the ``ml-100k.zip`` archive.
    DOWNLOAD_URL: str = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
    # Directory the readers load from; resolved against the working directory
    # once, when the class body is executed (i.e. at import time).
    DEFAULT_PATH: str = os.path.join(os.getcwd(), "ml-100k")

    def read_ratings(self) -> pd.DataFrame:
        """Load the tab-separated ``u.data`` file.

        Returns:
            A DataFrame with the columns ``userId``, ``movieId``, ``rating``
            and ``timestamp``. ``timestamp`` is converted from seconds since
            the epoch to pandas datetimes.
        """
        ratings_path: str = os.path.join(self.DEFAULT_PATH, "u.data")
        df_ratings: pd.DataFrame = pd.read_csv(
            ratings_path,
            sep="\t",
            header=None,
            engine="python",
            names=["userId", "movieId", "rating", "timestamp"],
        )
        df_ratings["timestamp"] = pd.to_datetime(df_ratings["timestamp"], unit="s")
        return df_ratings

    def read_users(self) -> pd.DataFrame:
        """Load the pipe-separated ``u.user` file.

        Returns:
            A DataFrame with the columns ``userId``, ``gender``, ``age``,
            ``occupation`` and ``zipcode``, in that order.
        """
        users_path: str = os.path.join(self.DEFAULT_PATH, "u.user")
        df_users: pd.DataFrame = pd.read_csv(
            users_path,
            sep="|",
            header=None,
            engine="python",
            # Order of the fields as they are read from the file.
            names=["userId", "age", "gender", "occupation", "zipcode"],
        )
        # Returned with ``gender`` ahead of ``age``, unlike the file order.
        return df_users[["userId", "gender", "age", "occupation", "zipcode"]]

    def _read_genres(self) -> List[str]:
        """Return the genre names listed in ``u.genre``, in file order.

        Every non-blank line is split on ``|`` and only its first field is
        kept. Blank lines (such as trailing newlines at the end of the file)
        are skipped, so the result does not depend on how many of them the
        file ends with.
        """
        genres_path: str = os.path.join(self.DEFAULT_PATH, "u.genre")
        # Explicit encoding keeps the result independent of the platform
        # locale; "latin-1" matches what ``read_items`` uses for ``u.item``.
        with open(genres_path, "r", encoding="latin-1") as genre_file:
            content = genre_file.read()
        return [line.split("|")[0] for line in content.splitlines() if line.strip()]

    def read_items(self) -> pd.DataFrame:
        """Load the pipe-separated ``u.item`` file.

        The per-genre flag columns (named after the entries of ``u.genre``)
        are folded into a single list-valued ``genres`` column.

        Returns:
            A DataFrame with the columns ``itemId``, ``title``, ``genres``
            and ``release_date``. ``genres`` holds a list of genre names per
            row (``[""]`` when no flag is set) and ``release_date`` is parsed
            into pandas datetimes.
        """
        movies_path: str = os.path.join(self.DEFAULT_PATH, "u.item")
        genres: List[str] = self._read_genres()
        df_items: pd.DataFrame = pd.read_csv(
            movies_path,
            sep="|",
            header=None,
            encoding="latin-1",
            engine="python",
            names=["itemId", "title", "release_date", "video_release_date", "URL"]
            + genres,
        )

        # Drop the last seven characters of every title -- presumably the
        # trailing " (YYYY)" year suffix; confirm against the data -- then
        # keep only the text before the first comma.  The vectorised ``.str``
        # accessors leave a missing title as NaN instead of raising.
        trimmed_titles = df_items["title"].str[:-7]
        df_items["title"] = trimmed_titles.str.split(",", n=1).str[0]

        df_items["release_date"] = pd.to_datetime(df_items["release_date"])

        # Matrix-multiplying the numeric flag columns with the strings
        # "<genre>|" repeats each name by its flag value and concatenates the
        # results, e.g. "Action|Comedy|" for a row whose flags are 0/1.
        genre_flags = df_items[genres]
        joined_genres = genre_flags @ (genre_flags.columns + "|")
        # Strip the trailing separator and split back into a list of names.
        df_items["genres"] = joined_genres.apply(lambda names: names[:-1].split("|"))

        # Selecting the wanted columns already discards the flag columns,
        # "video_release_date" and "URL".
        return df_items[["itemId", "title", "genres", "release_date"]]