Source code for salesanalyzer_mds.sales_summary_statistics

import pandas as pd


[docs] def sales_summary_statistics(sales_data: pd.DataFrame, quantity_col: str = 'Quantity', price_col: str = 'UnitPrice', customer_col: str = 'CustomerID', invoice_col: str = 'InvoiceNo', description_col: str = 'Description') -> pd.DataFrame: """ Generate summary statistics for sales data, including total revenue, unique customers, average order value, top-selling products by quantity and revenue, and average revenue per customer. Args: ----- sales_data (pandas.DataFrame): A DataFrame containing sales data with at least the following columns: quantity_col, price_col, customer_col, invoice_col, and description_col. quantity_col (str): The name of the column containing the quantity sold. price_col (str): The name of the column containing the unit price of the product. customer_col (str): The name of the column containing the customer ID. invoice_col (str): The name of the column containing the invoice number. description_col (str): The name of the column containing the product description. Returns: -------- pandas.DataFrame: A DataFrame containing the calculated summary statistics. If no sales data is provided, returns an empty DataFrame. The function computes the following statistics: - 'total_revenue': The total revenue generated by all sales. - 'unique_customers': The number of unique customers. - 'average_order_value': The average value of an order (sum of revenue per invoice). - 'top_selling_product_quantity': The product with the highest quantity sold. - 'top_selling_product_revenue': The product with the highest total revenue. - 'average_revenue_per_customer': The average revenue generated by each customer. Example: -------- >>> df = pd.DataFrame({ >>> 'Quantity': [10, 5, 3, 15], >>> 'UnitPrice': [100, 200, 150, 100], >>> 'CustomerID': [1, 2, 1, 3], >>> 'InvoiceNo': ['INV001', 'INV002', 'INV003', 'INV004'], >>> 'Description': ['Product A', 'Product B', 'Product A', 'Product C'] >>> }) >>> summary_df = sales_summary_statistics(df) >>> print(summary_df) total_revenue unique_customers average_order_value \ 0 3950.0 3 987.5 top_selling_product_quantity top_selling_product_revenue \ 0 Product C Product C average_revenue_per_customer 0 1316.666667 """ if not isinstance(sales_data, pd.DataFrame): raise ValueError("sales_data parameter should be a pandas DataFrame") # Check for missing or incorrect columns required_columns = [quantity_col, price_col, customer_col, invoice_col, description_col] missing_columns = [ col for col in required_columns if col not in sales_data.columns] if missing_columns: raise ValueError( f"Missing required columns: {', '.join(missing_columns)}") if len(sales_data.index) == 0: return pd.DataFrame({}) # Calculate revenue sales_data['Revenue'] = sales_data[quantity_col] * sales_data[price_col] sales_data['Revenue'].astype(float) # Calculate summary statistics unique_customers = sales_data[customer_col].nunique() total_revenue = sales_data['Revenue'].sum().astype(float) # Calculate top-selling products based on quantity top_selling_product_quantity = ( sales_data.groupby(description_col)[quantity_col].sum()) max_quantity = top_selling_product_quantity.max() top_selling_product_quantity = ( top_selling_product_quantity[top_selling_product_quantity == max_quantity].index.tolist()) # Calculate top-selling products based on revenue top_selling_product_revenue = ( sales_data.groupby(description_col)['Revenue'].sum()) max_revenue = top_selling_product_revenue.max() top_selling_product_revenue = ( top_selling_product_revenue[top_selling_product_revenue == max_revenue].index.tolist()) average_order_value = sales_data.groupby( invoice_col)['Revenue'].sum().mean() revenue_per_customer = sales_data.groupby(customer_col)['Revenue'].sum() average_revenue_per_customer = revenue_per_customer.mean() # Create summary statistics summary = { 'total_revenue': [total_revenue], 'unique_customers': [unique_customers], 'average_order_value': [average_order_value], 'top_selling_product_quantity': [', '.join(top_selling_product_quantity)], 'top_selling_product_revenue': [', '.join(top_selling_product_revenue)], 'average_revenue_per_customer': [average_revenue_per_customer] } # Transpose and rename the column summary_t = pd.DataFrame(summary).T.rename(columns={0: "Value"}) return summary_t