How to Generate Fake Sales Reports with Python Faker
Step 1: Set Up Your Environment
# matplotlib is needed for the visualization step (Step 4)
pip install pandas faker matplotlib
Step 2: Create Data Generation Script (sales_data_generator.py)
import pandas as pd
from faker import Faker
# Shared Faker instance; generate_sales_data() uses it for every random value.
fake = Faker()
def generate_sales_data(num_records=25000):
    """Build a DataFrame of randomly generated sales transactions.

    Args:
        num_records: Number of transaction rows to generate.

    Returns:
        A pandas DataFrame with one row per transaction and the columns
        Transaction_ID, Date, Customer_Name, Region, Product_Category,
        Product, Quantity, Price_per_Unit and Total_Sales
        (Quantity * Price_per_Unit).
    """
    categories = {
        "Electronics": ["Laptop", "Smartphone", "Tablet", "Headphones"],
        "Furniture": ["Chair", "Table", "Sofa", "Bookshelf"],
        "Clothing": ["Shirt", "Jeans", "Jacket", "Shoes"],
        "Food": ["Snacks", "Beverages", "Fruits", "Vegetables"],
        "Books": ["Fiction", "Non-fiction", "Textbook", "Magazine"],
    }
    regions = ["North", "South", "East", "West"]
    unit_prices = [20, 50, 100, 200, 500]
    category_names = list(categories)

    # Draw the categories up front: the Product column is derived from them,
    # and a dict literal cannot refer to its own keys while it is being built.
    product_categories = [
        fake.random_element(category_names) for _ in range(num_records)
    ]

    # Every column must hold exactly num_records values, otherwise the
    # DataFrame constructor rejects the dict for having unequal lengths.
    data = {
        "Transaction_ID": [fake.uuid4() for _ in range(num_records)],
        "Date": [fake.date_between(start_date="-2y") for _ in range(num_records)],
        "Customer_Name": [fake.name() for _ in range(num_records)],
        "Region": [fake.random_element(regions) for _ in range(num_records)],
        "Product_Category": product_categories,
        "Product": [
            fake.random_element(categories[cat]) for cat in product_categories
        ],
        "Quantity": [fake.random_int(1, 20) for _ in range(num_records)],
        "Price_per_Unit": [
            fake.random_element(unit_prices) for _ in range(num_records)
        ],
    }

    df = pd.DataFrame(data)
    df["Total_Sales"] = df["Quantity"] * df["Price_per_Unit"]
    return df
# Build the dataset with the default record count and write it to disk;
# index=False keeps the DataFrame's row index out of the CSV.
df = generate_sales_data()
df.to_csv(path_or_buf="sales_data.csv", index=False)
Step 3: Analyze Generated Data
import pandas as pd
# Load the transactions written by the generation step.
df = pd.read_csv("sales_data.csv")

# Top performing categories: total revenue per category, highest first.
category_totals = df.groupby("Product_Category")["Total_Sales"].sum()
category_sales = category_totals.sort_values(ascending=False)

# Regional performance: sum, mean and count of Total_Sales per region.
regional_stats = df.pivot_table(
    values="Total_Sales",
    index="Region",
    aggfunc=["sum", "mean", "count"],
)

# Anomaly detection: rows whose product is a food item although the
# recorded category is something other than Food.
food_item_mask = df["Product"].isin(["Fruits", "Vegetables"])
non_food_category_mask = df["Product_Category"] != "Food"
anomalies = df[food_item_mask & non_food_category_mask]
Step 4: Create Visualizations
import matplotlib.pyplot as plt
# Sales by category: each category's share of total revenue as a pie chart.
plt.figure(figsize=(10, 6))
df.groupby('Product_Category')['Total_Sales'].sum().plot(
    kind='pie',
    autopct='%1.1f%%',
    title='Sales Distribution by Category'
)
plt.savefig('sales_by_category.png')
# Close the figure so the next chart cannot be drawn on top of the pie.
plt.close()

# Monthly sales trend: total revenue per calendar month.
df['Month'] = pd.to_datetime(df['Date']).dt.to_period('M')
monthly_sales = df.groupby('Month')['Total_Sales'].sum()
# Series.plot draws on the current axes when no `ax` is given, so start a
# fresh figure; otherwise the line would be overlaid on the pie chart.
plt.figure(figsize=(10, 6))
monthly_sales.plot(kind='line', title='Monthly Sales Trend')
plt.savefig('monthly_trend.png')
plt.close()
Step 5: Automate Reporting
from datetime import datetime
def generate_report(data_path="sales_data.csv", report_path="sales_report.md"):
    """Write a Markdown summary of the sales data.

    Args:
        data_path: CSV file to read; it must contain a ``Total_Sales``
            column. Defaults to the file produced by the generation step.
        report_path: Destination of the Markdown report. An existing file
            is overwritten.

    Returns:
        None. The report is written to ``report_path``.
    """
    df = pd.read_csv(data_path)
    report_date = datetime.now().strftime("%Y-%m-%d")

    total_sales = df["Total_Sales"].sum()
    average_sale = df["Total_Sales"].mean()

    # Explicit encoding keeps the output independent of the platform default.
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(f"# Sales Report - {report_date}\n\n")
        f.write("## Key Metrics\n")
        f.write(f"- Total Sales: ${total_sales:,}\n")
        f.write(f"- Average Transaction Value: ${average_sale:.2f}\n\n")
        # Two trailing blank lines, kept as in the original report layout.
        f.write("\n")
        f.write("\n")
# Produce sales_report.md from the current sales_data.csv.
generate_report()
graph TD
%% Top-down workflow: each stage feeds the next one.
A[Generate Data] --> B[Analyze Data]
B --> C[Create Visualizations]
C --> D[Generate Markdown Report]
D --> E[Automate Process]
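This guide shows how to generate realistic fake sales data for testing analytics pipelines and reporting systems. Note that the generator draws each product from the item list of its own category, so the anomaly check in Step 3 only returns rows if category/product mismatches are injected deliberately; adding such randomized mismatches is a useful way to test data validation systems.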