Source code for salesanalyzer_mds.predict_sales

import pandas as pd
from pandas.api.types import is_numeric_dtype
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

def predict_sales(sales_data, new_data, numeric_features,
                  categorical_features, target, date_feature=None,
                  test_size=0.3):
    """
    Predict sales for new observations from historical sales data.

    Categorical features are one-hot encoded, the remaining feature columns
    are passed through unchanged, and a ``RandomForestRegressor`` is fitted
    on a train split of ``sales_data``. The mean squared error and R^2 on
    the held-out split are printed, and predictions for ``new_data`` are
    returned.

    None of the arguments are modified: the date-derived columns are added
    to private copies of the data frames and the feature list.

    Parameters
    ----------
    sales_data : pd.DataFrame
        Historical sales data. Rows with a missing value in any column are
        dropped before training.
    new_data : pd.DataFrame
        New data to predict on. Must contain every feature column.
    numeric_features : list
        Names of the numeric feature columns.
    categorical_features : list
        Names of the categorical feature columns (one-hot encoded).
    target : str
        Name of the target column in ``sales_data``.
    date_feature : str, optional
        Name of a date column. When given, it is parsed with
        ``pd.to_datetime`` and expanded into numeric ``year``, ``month``
        and ``day`` features. Default: None.
    test_size : float
        Proportion of ``sales_data`` held out for evaluation.
        Default: 0.3.

    Returns
    -------
    pd.DataFrame
        A single-column data frame named ``"Predicted values"`` holding one
        prediction per row of ``new_data``, rounded to 2 decimals.

    Raises
    ------
    ValueError
        If an argument has the wrong type (data frames, feature lists,
        ``target`` or ``date_feature``).
    TypeError
        If a column listed in ``numeric_features`` is not numeric in
        ``sales_data``.
    KeyError
        Raised by pandas if a requested column is absent.

    Examples
    --------
    >>> sales_data = pd.DataFrame({
    ...     'name': ['laptop', 'monitor', 'laptop', 'mouse'],
    ...     'price': [100, 200, 110, 20],
    ...     'quantity': [2, 1, 3, 5]})
    >>> new_data = pd.DataFrame({'name': ['laptop'], 'price': [300]})
    >>> predictions = predict_sales(
    ...     sales_data, new_data, ['price'], ['name'], 'quantity'
    ... )  # doctest: +SKIP

    The call prints the MSE and R^2 of the model and returns a one-row
    data frame with the predicted quantity.
    """
    if not isinstance(sales_data, pd.DataFrame):
        raise ValueError("sales_data parameter should be a pandas DataFrame")
    if not isinstance(new_data, pd.DataFrame):
        raise ValueError("new_data parameter should be a pandas DataFrame")
    if not isinstance(numeric_features, list):
        raise ValueError("numeric features should be a list")
    if not isinstance(categorical_features, list):
        raise ValueError("categorical features should be a list")
    if not isinstance(target, str):
        raise ValueError("target should be a string")

    for column in numeric_features:
        if not is_numeric_dtype(sales_data[column]):
            raise TypeError(
                "numeric_features should countain numeric data type only"
            )

    # Explicit copy: columns may be added below, and assigning onto the
    # result of dropna() can otherwise trigger chained-assignment warnings.
    sales_data = sales_data.dropna().copy()

    # Build a new list rather than extending the caller's list in place.
    numeric_columns = list(numeric_features)

    if date_feature:
        if not isinstance(date_feature, str):
            raise ValueError("date features should be a string")

        # Parse each date column once and derive the parts from that result.
        sales_dates = pd.to_datetime(sales_data[date_feature])
        sales_data["year"] = sales_dates.dt.year
        sales_data["month"] = sales_dates.dt.month
        sales_data["day"] = sales_dates.dt.day

        # Copy so the derived columns never leak into the caller's frame.
        new_data = new_data.copy()
        new_dates = pd.to_datetime(new_data[date_feature])
        new_data["year"] = new_dates.dt.year
        new_data["month"] = new_dates.dt.month
        new_data["day"] = new_dates.dt.day

        numeric_columns = numeric_columns + ["year", "month", "day"]

    # Column order (numeric, date parts, categorical) determines the layout
    # of the passthrough columns seen by the model, so keep it stable.
    feature_columns = numeric_columns + categorical_features

    X = sales_data[feature_columns]
    y = sales_data[target]
    X_new = new_data[feature_columns]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=123
    )

    preprocessor = make_column_transformer(
        (OneHotEncoder(handle_unknown="ignore"), categorical_features),
        remainder='passthrough'
    )

    # Fit the encoder on the training split only, then reuse it everywhere.
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)
    X_new = preprocessor.transform(X_new)

    model = RandomForestRegressor(random_state=123)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    new_pred = model.predict(X_new)

    print("MSE of the model:", round(mse, 2))
    print("R_squared of the model:", round(r2, 2))

    result = pd.DataFrame({
        "Predicted values": [round(value, 2) for value in new_pred]
    })

    return result