# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set style for better visualizations
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

# Load the CSV file
df = pd.read_csv('data/home_visits.csv')

# Display basic information
# print("Dataset Shape:", df.shape)
# print("\nColumn Names:", df.columns.tolist())
# print("\nData Types:")
# print(df.dtypes)

# Display first few rows
# print("\nFirst 5 rows:")
# df.head()

# Convert Requested Date to datetime
df['Requested Date'] = pd.to_datetime(df['Requested Date'])

# Extract additional date features
df['Year'] = df['Requested Date'].dt.year
df['Month'] = df['Requested Date'].dt.month
df['Month_Name'] = df['Requested Date'].dt.month_name()
df['Day'] = df['Requested Date'].dt.day
df['Weekday'] = df['Requested Date'].dt.weekday
df['Weekday_Name'] = df['Requested Date'].dt.day_name()
df['Week_Number'] = df['Requested Date'].dt.isocalendar().week
df['Quarter'] = df['Requested Date'].dt.quarter

# Extract postcode area (first part before space)
df['Postcode_Area'] = df['Postcode'].str.replace(' ', '').str[:4]

# Check for missing values
print("Missing Values:")
print(df.isnull().sum())

# Basic statistics
print("\nDataset Summary:")
print(f"Total number of visits: {len(df)}")
print(f"Date range: {df['Requested Date'].min()} to {df['Requested Date'].max()}")
print(f"Number of unique clinicians: {df['Assigned Clinician'].nunique()}")
print(f"Number of unique postcodes: {df['Postcode'].nunique()}")

Missing Values:
Address                2
Postcode               2
Telephone              5
Requested Date         0
Assigned Clinician    34
Comments              50
Year                   0
Month                  0
Month_Name             0
Day                    0
Weekday                0
Weekday_Name           0
Week_Number            0
Quarter                0
Postcode_Area          2
dtype: int64

Dataset Summary:
Total number of visits: 3112
Date range: 2020-01-10 10:51:00 to 2026-01-08 15:56:00
Number of unique clinicians: 28
Number of unique postcodes: 279

# Count visits by weekday
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_counts = df['Weekday_Name'].value_counts().reindex(weekday_order, fill_value=0)

# Create visualization
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Bar chart
weekday_counts.plot(kind='bar', ax=axes[0], color='steelblue', edgecolor='black')
axes[0].set_title('Number of Visits by Weekday', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Weekday', fontsize=12)
axes[0].set_ylabel('Number of Visits', fontsize=12)
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(axis='y', alpha=0.3)

# Add value labels on bars
for i, v in enumerate(weekday_counts):
    axes[0].text(i, v + max(weekday_counts)*0.01, str(v), ha='center', fontweight='bold')

# Pie chart
colors = plt.cm.Set3(range(len(weekday_counts)))
axes[1].pie(weekday_counts.values, labels=weekday_counts.index, autopct='%1.1f%%', startangle=90, colors=colors)
axes[1].set_title('Percentage Distribution by Weekday', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# Display table
weekday_table = pd.DataFrame({
    'Weekday': weekday_counts.index,
    'Count': weekday_counts.values,
    'Percentage': (weekday_counts.values / len(df) * 100).round(1)
})
print("\nVisits by Weekday:")
print(weekday_table.to_string(index=False))

Visits by Weekday:
  Weekday  Count  Percentage
   Monday    849        27.3
  Tuesday    642        20.6
Wednesday    541        17.4
 Thursday    526        16.9
   Friday    554        17.8
 Saturday      0         0.0
   Sunday      0         0.0

# Monthly visits
monthly_counts = df.groupby(['Year', 'Month', 'Month_Name']).size().reset_index(name='Visit_Count')
monthly_counts = monthly_counts.sort_values(['Year', 'Month'])

# Create visualization - single bar chart
fig, ax = plt.subplots(figsize=(16, 6))

# Bar chart of monthly visits for 2025
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 
              'July', 'August', 'September', 'October', 'November', 'December']
monthly_2025 = df[df['Year'] == 2025].groupby('Month_Name').size().reindex(month_order, fill_value=0)

monthly_2025.plot(kind='bar', ax=ax, color='coral', edgecolor='black')
ax.set_title('Monthly Visits (2025)', fontsize=14, fontweight='bold')
ax.set_xlabel('Month', fontsize=12)
ax.set_ylabel('Number of Visits', fontsize=12)
ax.tick_params(axis='x', rotation=45)
ax.grid(axis='y', alpha=0.3)

# Add value labels
for i, v in enumerate(monthly_2025):
    if v > 0:  # Only show label if there are visits
        ax.text(i, v + max(monthly_2025)*0.01, str(v), ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

# Display detailed monthly table for 2025
print("\nMonthly Breakdown (2025):")
monthly_2025_df = monthly_counts[monthly_counts['Year'] == 2025][['Month_Name', 'Visit_Count']]
monthly_2025_df = monthly_2025_df.set_index('Month_Name').reindex(month_order, fill_value=0)
monthly_2025_df.loc['Total'] = monthly_2025_df.sum()
print(monthly_2025_df)

Monthly Breakdown (2025):
            Visit_Count
Month_Name             
January             267
February            222
March               250
April               221
May                 240
June                219
July                256
August              240
September           232
October             331
November            255
December            275
Total              3008

# Calculate percentage of 2025 visits for the requested nursing homes
nursing_homes = [
    ('The Acorns', 'WN2 3LJ'),
    ('Rosebridge and Carrington', 'WN2 3DU'),
    ('The Oaks', 'WN2 4LZ'),
    ('Thorley House', 'WN2 3QD')
]

df_2025 = df[df['Year'] == 2025].copy()
df_2025['Postcode_Clean'] = df_2025['Postcode'].fillna('').str.upper().str.replace(' ', '', regex=False)

total_2025_visits = len(df_2025)
rows = []
for name, postcode in nursing_homes:
    key = postcode.upper().replace(' ', '')
    count = int((df_2025['Postcode_Clean'] == key).sum())
    percentage = round((count / total_2025_visits * 100) if total_2025_visits else 0, 1)
    rows.append({'Nursing Home': name, 'Postcode': postcode, 'Visits': count, 'Percentage (%)': percentage})

percentage_table = pd.DataFrame(rows)
print('\nNursing Home Visit Percentages (2025):')
print(percentage_table.to_string(index=False))
print(f'\nTotal visits in 2025: {total_2025_visits}')

# Add remaining visits as Other and plot the share
other_count = total_2025_visits - percentage_table['Visits'].sum()
share_df = percentage_table[['Nursing Home', 'Visits']].copy()
other_row = pd.DataFrame([{'Nursing Home': 'Other', 'Visits': int(other_count)}])
share_df = pd.concat([share_df, other_row], ignore_index=True)
share_df['Fraction'] = share_df['Visits'] / total_2025_visits

print('\nVisit share including Other:')
print(share_df[['Nursing Home', 'Visits', 'Fraction']].to_string(index=False))

# Chart the fraction of total visits in 2025
fig, ax = plt.subplots(figsize=(10, 6))

def label_with_count(pct, allvals):
    absolute = int(round(pct/100.*sum(allvals)))
    return f"{absolute} ({pct:.1f}%)"

ax.pie(
    share_df['Visits'],
    labels=share_df['Nursing Home'],
    autopct=lambda pct: label_with_count(pct, share_df['Visits']),
    startangle=140,
    colors=plt.cm.tab20.colors
)
ax.set_title('2025 Visit Share: Selected Nursing Homes vs Other', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

Nursing Home Visit Percentages (2025):
             Nursing Home Postcode  Visits  Percentage (%)
               The Acorns  WN2 3LJ     296             9.8
Rosebridge and Carrington  WN2 3DU     608            20.2
                 The Oaks  WN2 4LZ     352            11.7
            Thorley House  WN2 3QD     216             7.2

Total visits in 2025: 3008

Visit share including Other:
             Nursing Home  Visits  Fraction
               The Acorns     296  0.098404
Rosebridge and Carrington     608  0.202128
                 The Oaks     352  0.117021
            Thorley House     216  0.071809
                    Other    1536  0.510638

# Visits per day
daily_counts = df.groupby(df['Requested Date'].dt.date).size().reset_index(name='Visits_Per_Day')
daily_counts = daily_counts.sort_values('Visits_Per_Day', ascending=False)

# Statistics
max_visits = daily_counts['Visits_Per_Day'].max()
min_visits = daily_counts['Visits_Per_Day'].min()
avg_visits = daily_counts['Visits_Per_Day'].mean()
median_visits = daily_counts['Visits_Per_Day'].median()

print("Daily Visit Statistics:")
print(f"Maximum visits in a single day: {max_visits}")
print(f"Minimum visits in a single day: {min_visits}")
print(f"Average visits per day: {avg_visits:.2f}")
print(f"Median visits per day: {median_visits:.2f}")

# Top 10 busiest days
print("\nTop 10 Busiest Days:")
top_10_days = daily_counts.head(10)
top_10_days['Weekday'] = pd.to_datetime(top_10_days['Requested Date']).dt.day_name()
print(top_10_days.to_string(index=False))

# Visualization of top 10 busiest days
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Bar chart of top 10 days
colors = plt.cm.Reds(np.linspace(0.3, 1, len(top_10_days)))
bars = axes[0].barh(range(len(top_10_days)), top_10_days['Visits_Per_Day'], color=colors, edgecolor='black')
axes[0].set_yticks(range(len(top_10_days)))
axes[0].set_yticklabels([f"{d.strftime('%d %b %Y')} ({w})" for d, w in zip(pd.to_datetime(top_10_days['Requested Date']), top_10_days['Weekday'])])
axes[0].set_xlabel('Number of Visits', fontsize=12)
axes[0].set_title('Top 10 Busiest Days', fontsize=14, fontweight='bold')
axes[0].invert_yaxis()
axes[0].grid(axis='x', alpha=0.3)

# Add value labels
for i, v in enumerate(top_10_days['Visits_Per_Day']):
    axes[0].text(v + max(top_10_days['Visits_Per_Day'])*0.02, i, str(v), va='center', fontweight='bold')

# Distribution of daily visits
axes[1].hist(daily_counts['Visits_Per_Day'], bins=20, color='skyblue', edgecolor='black', alpha=0.7)
axes[1].axvline(avg_visits, color='red', linestyle='--', linewidth=2, label=f'Mean: {avg_visits:.1f}')
axes[1].axvline(median_visits, color='green', linestyle='--', linewidth=2, label=f'Median: {median_visits:.1f}')
axes[1].set_xlabel('Number of Visits per Day', fontsize=12)
axes[1].set_ylabel('Frequency', fontsize=12)
axes[1].set_title('Distribution of Daily Visits', fontsize=14, fontweight='bold')
axes[1].legend()
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.show()

Daily Visit Statistics:
Maximum visits in a single day: 29
Minimum visits in a single day: 1
Average visits per day: 11.19
Median visits per day: 11.00

Top 10 Busiest Days:
Requested Date  Visits_Per_Day Weekday
    2025-05-27              29 Tuesday
    2025-02-10              26  Monday
    2025-12-29              26  Monday
    2025-09-29              26  Monday
    2025-07-21              25  Monday
    2025-03-31              24  Monday
    2025-09-08              24  Monday
    2025-11-24              23  Monday
    2026-01-05              23  Monday
    2025-10-13              22  Monday

# Count visits by postcode area
postcode_counts = df['Postcode_Area'].value_counts().sort_values(ascending=False)
postcode_percentage = (postcode_counts / len(df) * 100).round(1)

# Only show top 15 for visualization
top_postcodes = postcode_counts.head(15)

# Create visualization
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Bar chart of top 15 postcode areas
colors = plt.cm.plasma(np.linspace(0, 1, len(top_postcodes)))
bars = axes[0].bar(range(len(top_postcodes)), top_postcodes.values, color=colors, edgecolor='black')
axes[0].set_xticks(range(len(top_postcodes)))
axes[0].set_xticklabels(top_postcodes.index, rotation=45, ha='right')
axes[0].set_ylabel('Number of Visits', fontsize=12)
axes[0].set_title('Top 15 Postcode Areas by Visit Count', fontsize=14, fontweight='bold')
axes[0].grid(axis='y', alpha=0.3)

# Add value labels
for i, v in enumerate(top_postcodes.values):
    axes[0].text(i, v + max(top_postcodes.values)*0.02, str(v), ha='center', fontweight='bold', fontsize=8)

# Pie chart of top 10
top_10_postcodes = postcode_counts.head(10)
axes[1].pie(top_10_postcodes.values, labels=top_10_postcodes.index, autopct='%1.1f%%', startangle=90)
axes[1].set_title('Top 10 Postcode Areas (Percentage)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

# Display table
postcode_table = pd.DataFrame({
    'Postcode Area': postcode_counts.index,
    'Number of Visits': postcode_counts.values,
    'Percentage': postcode_percentage.values
})
print("\nVisits by Postcode Area (Top 20):")
print(postcode_table.head(20).to_string(index=False))

print(f"\nTotal unique postcode areas: {len(postcode_counts)}")

Visits by Postcode Area (Top 20):
Postcode Area  Number of Visits  Percentage
         WN23              1576        50.6
         WN24              1166        37.5
         WN22               280         9.0
         WN75                28         0.9
         BL52                15         0.5
         WN34                13         0.4
         WN25                12         0.4
         WN21                12         0.4
         M460                 3         0.1
         WN13                 2         0.1
         WN72                 1         0.0
         BL64                 1         0.0
         WN49                 1         0.0

Total unique postcode areas: 13

# Count visits by quarter and year
quarterly_counts = df.groupby(['Year', 'Quarter']).size().reset_index(name='Visit_Count')
quarterly_counts = quarterly_counts.sort_values(['Year', 'Quarter'])
quarterly_counts['Quarter_Label'] = 'Q' + quarterly_counts['Quarter'].astype(str)

# Create visualization
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Stacked bar chart by year
years = df['Year'].unique()
quarters = [1, 2, 3, 4]
bottom = np.zeros(len(years))

colors_quarters = ['#ff6b6b', '#4ecdc4', '#45b7d1', '#96ceb4']

for i, quarter in enumerate(quarters):
    values = [quarterly_counts[(quarterly_counts['Year'] == year) & (quarterly_counts['Quarter'] == quarter)]['Visit_Count'].values[0] if len(quarterly_counts[(quarterly_counts['Year'] == year) & (quarterly_counts['Quarter'] == quarter)]) > 0 else 0 for year in years]
    axes[0].bar(years, values, bottom=bottom, label=f'Q{quarter}', color=colors_quarters[i], edgecolor='black')
    bottom = bottom + np.array(values)

axes[0].set_xlabel('Year', fontsize=12)
axes[0].set_ylabel('Number of Visits', fontsize=12)
axes[0].set_title('Quarterly Visits by Year', fontsize=14, fontweight='bold')
axes[0].legend(title='Quarter')
axes[0].grid(axis='y', alpha=0.3)

# Bar chart of total quarterly visits (aggregated)
total_quarterly = df.groupby('Quarter').size().reindex([1, 2, 3, 4], fill_value=0)
quarter_labels = ['Q1', 'Q2', 'Q3', 'Q4']
axes[1].bar(quarter_labels, total_quarterly.values, color=colors_quarters, edgecolor='black')
axes[1].set_xlabel('Quarter', fontsize=12)
axes[1].set_ylabel('Total Number of Visits', fontsize=12)
axes[1].set_title('Total Visits by Quarter (All Years)', fontsize=14, fontweight='bold')
axes[1].grid(axis='y', alpha=0.3)

# Add value labels
for i, v in enumerate(total_quarterly.values):
    axes[1].text(i, v + max(total_quarterly.values)*0.02, str(v), ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

# Display table
quarterly_pivot = quarterly_counts.pivot(index='Quarter', columns='Year', values='Visit_Count').fillna(0).astype(int)
quarterly_pivot = quarterly_pivot.reindex([1, 2, 3, 4])
quarterly_pivot.index = ['Q1', 'Q2', 'Q3', 'Q4']
quarterly_pivot['Total'] = quarterly_pivot.sum(axis=1)
quarterly_pivot.loc['Total'] = quarterly_pivot.sum(axis=0)
print("\nQuarterly Breakdown:")
print(quarterly_pivot)

Quarterly Breakdown:
Year   2020  2021  2023  2024  2025  2026  Total
Q1        1     0     0     3   739    84    827
Q2        0     0     0     0   680     0    680
Q3        0     2     1     6   728     0    737
Q4        1     0     1     5   861     0    868
Total     2     2     2    14  3008    84   3112

# Filter data to only include 2025
df_2025 = df[df['Year'] == 2025].copy()

# Create time series data
df_2025['Date'] = df_2025['Requested Date'].dt.date
time_series = df_2025.groupby('Date').size().reset_index(name='Visits')
time_series['Date'] = pd.to_datetime(time_series['Date'])
time_series = time_series.sort_values('Date')

# Calculate moving average (7-day and 30-day)
time_series['MA_7'] = time_series['Visits'].rolling(window=7, min_periods=1).mean()
time_series['MA_30'] = time_series['Visits'].rolling(window=30, min_periods=1).mean()

# Create visualization
fig, axes = plt.subplots(2, 1, figsize=(16, 12))

# Daily visits with moving averages
axes[0].plot(time_series['Date'], time_series['Visits'], alpha=0.3, label='Daily Visits', color='gray')
axes[0].plot(time_series['Date'], time_series['MA_7'], label='7-Day Moving Average', linewidth=2, color='blue')
axes[0].plot(time_series['Date'], time_series['MA_30'], label='30-Day Moving Average', linewidth=2, color='red')
axes[0].set_xlabel('Date', fontsize=12)
axes[0].set_ylabel('Number of Visits', fontsize=12)
axes[0].set_title('Daily Visits with Moving Averages', fontsize=14, fontweight='bold')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
axes[0].tick_params(axis='x', rotation=45)

# Monthly trend line
monthly_trend = df_2025.groupby(['Year', 'Month', 'Month_Name']).size().reset_index(name='Visits')
monthly_trend = monthly_trend.sort_values(['Year', 'Month'])
monthly_trend['Period'] = monthly_trend['Year'].astype(str) + '-' + monthly_trend['Month'].astype(str).str.zfill(2)

axes[1].plot(range(len(monthly_trend)), monthly_trend['Visits'], marker='o', linewidth=2, markersize=6, color='darkgreen')
axes[1].set_xticks(range(0, len(monthly_trend), max(1, len(monthly_trend)//12)))
axes[1].set_xticklabels(monthly_trend['Period'][::max(1, len(monthly_trend)//12)], rotation=45, ha='right')
axes[1].set_xlabel('Period (Year-Month)', fontsize=12)
axes[1].set_ylabel('Number of Visits', fontsize=12)
axes[1].set_title('Monthly Visit Trend', fontsize=14, fontweight='bold')
axes[1].grid(True, alpha=0.3)

# Add trend line
z = np.polyfit(range(len(monthly_trend)), monthly_trend['Visits'], 1)
p = np.poly1d(z)
axes[1].plot(range(len(monthly_trend)), p(range(len(monthly_trend))), 
             "r--", linewidth=2, label=f'Trend (y={z[0]:.2f}x+{z[1]:.2f})')
axes[1].legend()

plt.tight_layout()
plt.show()

# Calculate trend statistics
if len(monthly_trend) > 1:
    first_month = monthly_trend['Visits'].iloc[0]
    last_month = monthly_trend['Visits'].iloc[-1]
    percent_change = ((last_month - first_month) / first_month) * 100
    
    print("\nTrend Analysis:")
    print(f"First month visits: {first_month}")
    print(f"Last month visits: {last_month}")
    print(f"Absolute change: {last_month - first_month:+}")
    print(f"Percentage change: {percent_change:+.1f}%")
    print(f"Trend slope: {z[0]:.2f} visits per month")

Trend Analysis:
First month visits: 267
Last month visits: 275
Absolute change: +8
Percentage change: +3.0%
Trend slope: 3.65 visits per month

# Bar chart of the top 4 full postcodes
full_postcode_counts = df['Postcode'].value_counts().head(4)
colors = plt.cm.Set3(np.linspace(0, 1, len(full_postcode_counts)))
plt.figure(figsize=(12, 6))
bars = plt.bar(range(len(full_postcode_counts)), full_postcode_counts.values, color=colors, edgecolor='black')
plt.xticks(range(len(full_postcode_counts)), full_postcode_counts.index, rotation=45, ha='right')
plt.ylabel('Number of Visits', fontsize=12)
plt.title('Top 10 Full Postcodes by Visit Count', fontsize=14, fontweight='bold')
plt.grid(axis='y', alpha=0.3)

# Add value labels
for i, v in enumerate(full_postcode_counts.values):
    plt.text(i, v + max(full_postcode_counts.values)*0.02, str(v), ha='center', fontweight='bold', fontsize=8)

plt.tight_layout()
plt.show()

Home Visits Analysis¶

Data Overview¶

Load Data¶

Data Cleaning and Preprocessing¶

1. Visits by Weekday¶

2. Monthly Analysis¶

2.1 Nursing Home Visit Percentage Summary¶

3. Maximum Visits Analysis¶

5. Geographical Analysis by Postcode¶

6. Quarterly Analysis¶

8. Trend Analysis Over Time¶

Most Common Full Postcodes¶