Home Visits Analysis¶
This notebook provides a comprehensive analysis of home visits from a GP surgery.
Data Overview¶
The dataset contains the following columns:
- Address: Patient's address
- Postcode: Patient's postcode
- Telephone: Patient's telephone number
- Requested Date: Date when the visit was requested
- Assigned Clinician: Clinician assigned to the visit
- Comments: Any additional comments
In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
# Set style for better visualizations
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)
Load Data¶
Loading the home visits dataset from CSV file.
In [2]:
# Load the CSV file
df = pd.read_csv('data/home_visits.csv')
# Display basic information
# print("Dataset Shape:", df.shape)
# print("\nColumn Names:", df.columns.tolist())
# print("\nData Types:")
# print(df.dtypes)
# Display first few rows
# print("\nFirst 5 rows:")
# df.head()
Data Cleaning and Preprocessing¶
In [3]:
# Convert Requested Date to datetime
df['Requested Date'] = pd.to_datetime(df['Requested Date'])
# Extract additional date features
df['Year'] = df['Requested Date'].dt.year
df['Month'] = df['Requested Date'].dt.month
df['Month_Name'] = df['Requested Date'].dt.month_name()
df['Day'] = df['Requested Date'].dt.day
df['Weekday'] = df['Requested Date'].dt.weekday
df['Weekday_Name'] = df['Requested Date'].dt.day_name()
df['Week_Number'] = df['Requested Date'].dt.isocalendar().week
df['Quarter'] = df['Requested Date'].dt.quarter
# Extract postcode area (first part before space)
df['Postcode_Area'] = df['Postcode'].str.replace(' ', '').str[:4]
# Check for missing values
print("Missing Values:")
print(df.isnull().sum())
# Basic statistics
print("\nDataset Summary:")
print(f"Total number of visits: {len(df)}")
print(f"Date range: {df['Requested Date'].min()} to {df['Requested Date'].max()}")
print(f"Number of unique clinicians: {df['Assigned Clinician'].nunique()}")
print(f"Number of unique postcodes: {df['Postcode'].nunique()}")
Missing Values: Address 2 Postcode 2 Telephone 5 Requested Date 0 Assigned Clinician 34 Comments 50 Year 0 Month 0 Month_Name 0 Day 0 Weekday 0 Weekday_Name 0 Week_Number 0 Quarter 0 Postcode_Area 2 dtype: int64 Dataset Summary: Total number of visits: 3112 Date range: 2020-01-10 10:51:00 to 2026-01-08 15:56:00 Number of unique clinicians: 28 Number of unique postcodes: 279
1. Visits by Weekday¶
Distribution of home visits across different days of the week.
In [4]:
# Count visits by weekday
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_counts = df['Weekday_Name'].value_counts().reindex(weekday_order, fill_value=0)
# Create visualization
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# Bar chart
weekday_counts.plot(kind='bar', ax=axes[0], color='steelblue', edgecolor='black')
axes[0].set_title('Number of Visits by Weekday', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Weekday', fontsize=12)
axes[0].set_ylabel('Number of Visits', fontsize=12)
axes[0].tick_params(axis='x', rotation=45)
axes[0].grid(axis='y', alpha=0.3)
# Add value labels on bars
for i, v in enumerate(weekday_counts):
axes[0].text(i, v + max(weekday_counts)*0.01, str(v), ha='center', fontweight='bold')
# Pie chart
colors = plt.cm.Set3(range(len(weekday_counts)))
axes[1].pie(weekday_counts.values, labels=weekday_counts.index, autopct='%1.1f%%', startangle=90, colors=colors)
axes[1].set_title('Percentage Distribution by Weekday', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()
# Display table
weekday_table = pd.DataFrame({
'Weekday': weekday_counts.index,
'Count': weekday_counts.values,
'Percentage': (weekday_counts.values / len(df) * 100).round(1)
})
print("\nVisits by Weekday:")
print(weekday_table.to_string(index=False))
Visits by Weekday: Weekday Count Percentage Monday 849 27.3 Tuesday 642 20.6 Wednesday 541 17.4 Thursday 526 16.9 Friday 554 17.8 Saturday 0 0.0 Sunday 0 0.0
2. Monthly Analysis¶
Breakdown of visits by month and year.
In [5]:
# Monthly visits
monthly_counts = df.groupby(['Year', 'Month', 'Month_Name']).size().reset_index(name='Visit_Count')
monthly_counts = monthly_counts.sort_values(['Year', 'Month'])
# Create visualization - single bar chart
fig, ax = plt.subplots(figsize=(16, 6))
# Bar chart of monthly visits for 2025
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
'July', 'August', 'September', 'October', 'November', 'December']
monthly_2025 = df[df['Year'] == 2025].groupby('Month_Name').size().reindex(month_order, fill_value=0)
monthly_2025.plot(kind='bar', ax=ax, color='coral', edgecolor='black')
ax.set_title('Monthly Visits (2025)', fontsize=14, fontweight='bold')
ax.set_xlabel('Month', fontsize=12)
ax.set_ylabel('Number of Visits', fontsize=12)
ax.tick_params(axis='x', rotation=45)
ax.grid(axis='y', alpha=0.3)
# Add value labels
for i, v in enumerate(monthly_2025):
if v > 0: # Only show label if there are visits
ax.text(i, v + max(monthly_2025)*0.01, str(v), ha='center', fontweight='bold')
plt.tight_layout()
plt.show()
# Display detailed monthly table for 2025
print("\nMonthly Breakdown (2025):")
monthly_2025_df = monthly_counts[monthly_counts['Year'] == 2025][['Month_Name', 'Visit_Count']]
monthly_2025_df = monthly_2025_df.set_index('Month_Name').reindex(month_order, fill_value=0)
monthly_2025_df.loc['Total'] = monthly_2025_df.sum()
print(monthly_2025_df)
Monthly Breakdown (2025):
Visit_Count
Month_Name
January 267
February 222
March 250
April 221
May 240
June 219
July 256
August 240
September 232
October 331
November 255
December 275
Total 3008
2.1 Nursing Home Visit Percentage Summary¶
This table shows the percentage of total 2025 visits for the specified nursing home postcodes.
In [11]:
# Calculate percentage of 2025 visits for the requested nursing homes
nursing_homes = [
('The Acorns', 'WN2 3LJ'),
('Rosebridge and Carrington', 'WN2 3DU'),
('The Oaks', 'WN2 4LZ'),
('Thorley House', 'WN2 3QD')
]
df_2025 = df[df['Year'] == 2025].copy()
df_2025['Postcode_Clean'] = df_2025['Postcode'].fillna('').str.upper().str.replace(' ', '', regex=False)
total_2025_visits = len(df_2025)
rows = []
for name, postcode in nursing_homes:
key = postcode.upper().replace(' ', '')
count = int((df_2025['Postcode_Clean'] == key).sum())
percentage = round((count / total_2025_visits * 100) if total_2025_visits else 0, 1)
rows.append({'Nursing Home': name, 'Postcode': postcode, 'Visits': count, 'Percentage (%)': percentage})
percentage_table = pd.DataFrame(rows)
print('\nNursing Home Visit Percentages (2025):')
print(percentage_table.to_string(index=False))
print(f'\nTotal visits in 2025: {total_2025_visits}')
# Add remaining visits as Other and plot the share
other_count = total_2025_visits - percentage_table['Visits'].sum()
share_df = percentage_table[['Nursing Home', 'Visits']].copy()
other_row = pd.DataFrame([{'Nursing Home': 'Other', 'Visits': int(other_count)}])
share_df = pd.concat([share_df, other_row], ignore_index=True)
share_df['Fraction'] = share_df['Visits'] / total_2025_visits
print('\nVisit share including Other:')
print(share_df[['Nursing Home', 'Visits', 'Fraction']].to_string(index=False))
# Chart the fraction of total visits in 2025
fig, ax = plt.subplots(figsize=(10, 6))
def label_with_count(pct, allvals):
absolute = int(round(pct/100.*sum(allvals)))
return f"{absolute} ({pct:.1f}%)"
ax.pie(
share_df['Visits'],
labels=share_df['Nursing Home'],
autopct=lambda pct: label_with_count(pct, share_df['Visits']),
startangle=140,
colors=plt.cm.tab20.colors
)
ax.set_title('2025 Visit Share: Selected Nursing Homes vs Other', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()
Nursing Home Visit Percentages (2025):
Nursing Home Postcode Visits Percentage (%)
The Acorns WN2 3LJ 296 9.8
Rosebridge and Carrington WN2 3DU 608 20.2
The Oaks WN2 4LZ 352 11.7
Thorley House WN2 3QD 216 7.2
Total visits in 2025: 3008
Visit share including Other:
Nursing Home Visits Fraction
The Acorns 296 0.098404
Rosebridge and Carrington 608 0.202128
The Oaks 352 0.117021
Thorley House 216 0.071809
Other 1536 0.510638
3. Maximum Visits Analysis¶
Analysis of peak visit days and time periods.
In [ ]:
# Visits per day
daily_counts = df.groupby(df['Requested Date'].dt.date).size().reset_index(name='Visits_Per_Day')
daily_counts = daily_counts.sort_values('Visits_Per_Day', ascending=False)
# Statistics
max_visits = daily_counts['Visits_Per_Day'].max()
min_visits = daily_counts['Visits_Per_Day'].min()
avg_visits = daily_counts['Visits_Per_Day'].mean()
median_visits = daily_counts['Visits_Per_Day'].median()
print("Daily Visit Statistics:")
print(f"Maximum visits in a single day: {max_visits}")
print(f"Minimum visits in a single day: {min_visits}")
print(f"Average visits per day: {avg_visits:.2f}")
print(f"Median visits per day: {median_visits:.2f}")
# Top 10 busiest days
print("\nTop 10 Busiest Days:")
top_10_days = daily_counts.head(10)
top_10_days['Weekday'] = pd.to_datetime(top_10_days['Requested Date']).dt.day_name()
print(top_10_days.to_string(index=False))
# Visualization of top 10 busiest days
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# Bar chart of top 10 days
colors = plt.cm.Reds(np.linspace(0.3, 1, len(top_10_days)))
bars = axes[0].barh(range(len(top_10_days)), top_10_days['Visits_Per_Day'], color=colors, edgecolor='black')
axes[0].set_yticks(range(len(top_10_days)))
axes[0].set_yticklabels([f"{d.strftime('%d %b %Y')} ({w})" for d, w in zip(pd.to_datetime(top_10_days['Requested Date']), top_10_days['Weekday'])])
axes[0].set_xlabel('Number of Visits', fontsize=12)
axes[0].set_title('Top 10 Busiest Days', fontsize=14, fontweight='bold')
axes[0].invert_yaxis()
axes[0].grid(axis='x', alpha=0.3)
# Add value labels
for i, v in enumerate(top_10_days['Visits_Per_Day']):
axes[0].text(v + max(top_10_days['Visits_Per_Day'])*0.02, i, str(v), va='center', fontweight='bold')
# Distribution of daily visits
axes[1].hist(daily_counts['Visits_Per_Day'], bins=20, color='skyblue', edgecolor='black', alpha=0.7)
axes[1].axvline(avg_visits, color='red', linestyle='--', linewidth=2, label=f'Mean: {avg_visits:.1f}')
axes[1].axvline(median_visits, color='green', linestyle='--', linewidth=2, label=f'Median: {median_visits:.1f}')
axes[1].set_xlabel('Number of Visits per Day', fontsize=12)
axes[1].set_ylabel('Frequency', fontsize=12)
axes[1].set_title('Distribution of Daily Visits', fontsize=14, fontweight='bold')
axes[1].legend()
axes[1].grid(alpha=0.3)
plt.tight_layout()
plt.show()
Daily Visit Statistics:
Maximum visits in a single day: 29
Minimum visits in a single day: 1
Average visits per day: 11.19
Median visits per day: 11.00
Top 10 Busiest Days:
Requested Date Visits_Per_Day Weekday
2025-05-27 29 Tuesday
2025-02-10 26 Monday
2025-12-29 26 Monday
2025-09-29 26 Monday
2025-07-21 25 Monday
2025-03-31 24 Monday
2025-09-08 24 Monday
2025-11-24 23 Monday
2026-01-05 23 Monday
2025-10-13 22 Monday
5. Geographical Analysis by Postcode¶
Analysis of visits by postcode areas.
In [ ]:
# Count visits by postcode area
postcode_counts = df['Postcode_Area'].value_counts().sort_values(ascending=False)
postcode_percentage = (postcode_counts / len(df) * 100).round(1)
# Only show top 15 for visualization
top_postcodes = postcode_counts.head(15)
# Create visualization
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# Bar chart of top 15 postcode areas
colors = plt.cm.plasma(np.linspace(0, 1, len(top_postcodes)))
bars = axes[0].bar(range(len(top_postcodes)), top_postcodes.values, color=colors, edgecolor='black')
axes[0].set_xticks(range(len(top_postcodes)))
axes[0].set_xticklabels(top_postcodes.index, rotation=45, ha='right')
axes[0].set_ylabel('Number of Visits', fontsize=12)
axes[0].set_title('Top 15 Postcode Areas by Visit Count', fontsize=14, fontweight='bold')
axes[0].grid(axis='y', alpha=0.3)
# Add value labels
for i, v in enumerate(top_postcodes.values):
axes[0].text(i, v + max(top_postcodes.values)*0.02, str(v), ha='center', fontweight='bold', fontsize=8)
# Pie chart of top 10
top_10_postcodes = postcode_counts.head(10)
axes[1].pie(top_10_postcodes.values, labels=top_10_postcodes.index, autopct='%1.1f%%', startangle=90)
axes[1].set_title('Top 10 Postcode Areas (Percentage)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()
# Display table
postcode_table = pd.DataFrame({
'Postcode Area': postcode_counts.index,
'Number of Visits': postcode_counts.values,
'Percentage': postcode_percentage.values
})
print("\nVisits by Postcode Area (Top 20):")
print(postcode_table.head(20).to_string(index=False))
print(f"\nTotal unique postcode areas: {len(postcode_counts)}")
Visits by Postcode Area (Top 20):
Postcode Area Number of Visits Percentage
WN23 1576 50.6
WN24 1166 37.5
WN22 280 9.0
WN75 28 0.9
BL52 15 0.5
WN34 13 0.4
WN25 12 0.4
WN21 12 0.4
M460 3 0.1
WN13 2 0.1
WN72 1 0.0
BL64 1 0.0
WN49 1 0.0
Total unique postcode areas: 13
6. Quarterly Analysis¶
Breakdown of visits by quarter.
In [ ]:
# Count visits by quarter and year
quarterly_counts = df.groupby(['Year', 'Quarter']).size().reset_index(name='Visit_Count')
quarterly_counts = quarterly_counts.sort_values(['Year', 'Quarter'])
quarterly_counts['Quarter_Label'] = 'Q' + quarterly_counts['Quarter'].astype(str)
# Create visualization
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
# Stacked bar chart by year
years = df['Year'].unique()
quarters = [1, 2, 3, 4]
bottom = np.zeros(len(years))
colors_quarters = ['#ff6b6b', '#4ecdc4', '#45b7d1', '#96ceb4']
for i, quarter in enumerate(quarters):
values = [quarterly_counts[(quarterly_counts['Year'] == year) & (quarterly_counts['Quarter'] == quarter)]['Visit_Count'].values[0] if len(quarterly_counts[(quarterly_counts['Year'] == year) & (quarterly_counts['Quarter'] == quarter)]) > 0 else 0 for year in years]
axes[0].bar(years, values, bottom=bottom, label=f'Q{quarter}', color=colors_quarters[i], edgecolor='black')
bottom = bottom + np.array(values)
axes[0].set_xlabel('Year', fontsize=12)
axes[0].set_ylabel('Number of Visits', fontsize=12)
axes[0].set_title('Quarterly Visits by Year', fontsize=14, fontweight='bold')
axes[0].legend(title='Quarter')
axes[0].grid(axis='y', alpha=0.3)
# Bar chart of total quarterly visits (aggregated)
total_quarterly = df.groupby('Quarter').size().reindex([1, 2, 3, 4], fill_value=0)
quarter_labels = ['Q1', 'Q2', 'Q3', 'Q4']
axes[1].bar(quarter_labels, total_quarterly.values, color=colors_quarters, edgecolor='black')
axes[1].set_xlabel('Quarter', fontsize=12)
axes[1].set_ylabel('Total Number of Visits', fontsize=12)
axes[1].set_title('Total Visits by Quarter (All Years)', fontsize=14, fontweight='bold')
axes[1].grid(axis='y', alpha=0.3)
# Add value labels
for i, v in enumerate(total_quarterly.values):
axes[1].text(i, v + max(total_quarterly.values)*0.02, str(v), ha='center', fontweight='bold')
plt.tight_layout()
plt.show()
# Display table
quarterly_pivot = quarterly_counts.pivot(index='Quarter', columns='Year', values='Visit_Count').fillna(0).astype(int)
quarterly_pivot = quarterly_pivot.reindex([1, 2, 3, 4])
quarterly_pivot.index = ['Q1', 'Q2', 'Q3', 'Q4']
quarterly_pivot['Total'] = quarterly_pivot.sum(axis=1)
quarterly_pivot.loc['Total'] = quarterly_pivot.sum(axis=0)
print("\nQuarterly Breakdown:")
print(quarterly_pivot)
Quarterly Breakdown: Year 2020 2021 2023 2024 2025 2026 Total Q1 1 0 0 3 739 84 827 Q2 0 0 0 0 680 0 680 Q3 0 2 1 6 728 0 737 Q4 1 0 1 5 861 0 868 Total 2 2 2 14 3008 84 3112
8. Trend Analysis Over Time¶
Time series analysis showing visit trends.
In [ ]:
# Filter data to only include 2025
df_2025 = df[df['Year'] == 2025].copy()
# Create time series data
df_2025['Date'] = df_2025['Requested Date'].dt.date
time_series = df_2025.groupby('Date').size().reset_index(name='Visits')
time_series['Date'] = pd.to_datetime(time_series['Date'])
time_series = time_series.sort_values('Date')
# Calculate moving average (7-day and 30-day)
time_series['MA_7'] = time_series['Visits'].rolling(window=7, min_periods=1).mean()
time_series['MA_30'] = time_series['Visits'].rolling(window=30, min_periods=1).mean()
# Create visualization
fig, axes = plt.subplots(2, 1, figsize=(16, 12))
# Daily visits with moving averages
axes[0].plot(time_series['Date'], time_series['Visits'], alpha=0.3, label='Daily Visits', color='gray')
axes[0].plot(time_series['Date'], time_series['MA_7'], label='7-Day Moving Average', linewidth=2, color='blue')
axes[0].plot(time_series['Date'], time_series['MA_30'], label='30-Day Moving Average', linewidth=2, color='red')
axes[0].set_xlabel('Date', fontsize=12)
axes[0].set_ylabel('Number of Visits', fontsize=12)
axes[0].set_title('Daily Visits with Moving Averages', fontsize=14, fontweight='bold')
axes[0].legend()
axes[0].grid(True, alpha=0.3)
axes[0].tick_params(axis='x', rotation=45)
# Monthly trend line
monthly_trend = df_2025.groupby(['Year', 'Month', 'Month_Name']).size().reset_index(name='Visits')
monthly_trend = monthly_trend.sort_values(['Year', 'Month'])
monthly_trend['Period'] = monthly_trend['Year'].astype(str) + '-' + monthly_trend['Month'].astype(str).str.zfill(2)
axes[1].plot(range(len(monthly_trend)), monthly_trend['Visits'], marker='o', linewidth=2, markersize=6, color='darkgreen')
axes[1].set_xticks(range(0, len(monthly_trend), max(1, len(monthly_trend)//12)))
axes[1].set_xticklabels(monthly_trend['Period'][::max(1, len(monthly_trend)//12)], rotation=45, ha='right')
axes[1].set_xlabel('Period (Year-Month)', fontsize=12)
axes[1].set_ylabel('Number of Visits', fontsize=12)
axes[1].set_title('Monthly Visit Trend', fontsize=14, fontweight='bold')
axes[1].grid(True, alpha=0.3)
# Add trend line
z = np.polyfit(range(len(monthly_trend)), monthly_trend['Visits'], 1)
p = np.poly1d(z)
axes[1].plot(range(len(monthly_trend)), p(range(len(monthly_trend))),
"r--", linewidth=2, label=f'Trend (y={z[0]:.2f}x+{z[1]:.2f})')
axes[1].legend()
plt.tight_layout()
plt.show()
# Calculate trend statistics
if len(monthly_trend) > 1:
first_month = monthly_trend['Visits'].iloc[0]
last_month = monthly_trend['Visits'].iloc[-1]
percent_change = ((last_month - first_month) / first_month) * 100
print("\nTrend Analysis:")
print(f"First month visits: {first_month}")
print(f"Last month visits: {last_month}")
print(f"Absolute change: {last_month - first_month:+}")
print(f"Percentage change: {percent_change:+.1f}%")
print(f"Trend slope: {z[0]:.2f} visits per month")
Trend Analysis: First month visits: 267 Last month visits: 275 Absolute change: +8 Percentage change: +3.0% Trend slope: 3.65 visits per month
Most Common Full Postcodes¶
This is a chart of the top 4 postcodes
In [ ]:
# Bar chart of the top 4 full postcodes
full_postcode_counts = df['Postcode'].value_counts().head(4)
colors = plt.cm.Set3(np.linspace(0, 1, len(full_postcode_counts)))
plt.figure(figsize=(12, 6))
bars = plt.bar(range(len(full_postcode_counts)), full_postcode_counts.values, color=colors, edgecolor='black')
plt.xticks(range(len(full_postcode_counts)), full_postcode_counts.index, rotation=45, ha='right')
plt.ylabel('Number of Visits', fontsize=12)
plt.title('Top 10 Full Postcodes by Visit Count', fontsize=14, fontweight='bold')
plt.grid(axis='y', alpha=0.3)
# Add value labels
for i, v in enumerate(full_postcode_counts.values):
plt.text(i, v + max(full_postcode_counts.values)*0.02, str(v), ha='center', fontweight='bold', fontsize=8)
plt.tight_layout()
plt.show()