Source code for rsdiv.dataset.movielens_1m

import os

import pandas as pd

from .base import BaseDownloader


[docs]class MovieLens1MDownLoader(BaseDownloader): """MovieLens dataset downLoader for 1M interactions.""" DOWNLOAD_URL: str = "http://files.grouplens.org/datasets/movielens/ml-1m.zip" DEFAULT_PATH: str = os.path.join(os.getcwd(), "ml-1m") def read_ratings(self) -> pd.DataFrame: ratings_path: str = os.path.join(self.DEFAULT_PATH, "ratings.dat") df_ratings: pd.DataFrame = pd.read_csv( ratings_path, sep="::", header=None, engine="python" ).copy() df_ratings.columns = pd.Index(["userId", "movieId", "rating", "timestamp"]) df_ratings["timestamp"] = pd.to_datetime(df_ratings.timestamp, unit="s") return df_ratings def read_users(self) -> pd.DataFrame: users_path: str = os.path.join(self.DEFAULT_PATH, "users.dat") df_users: pd.DataFrame = pd.read_csv( users_path, sep="::", header=None, engine="python", names=["userId", "gender", "age", "occupation", "zipcode"], ) return df_users def read_items(self) -> pd.DataFrame: movies_path: str = os.path.join(self.DEFAULT_PATH, "movies.dat") df_items: pd.DataFrame = pd.read_csv( movies_path, sep="::", header=None, encoding="latin-1", engine="python", names=["itemId", "title", "genres"], ) df_items["release_date"] = df_items["title"].str[-5:-1].astype("int") df_items["title"] = df_items["title"].str[:-7] df_items["title"] = df_items["title"].apply(lambda x: x.split(",")[0]) df_items["genres"] = df_items["genres"].apply(lambda x: x.split("|")) return df_items