import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns


# Load the hotel bookings export into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer="hotel_booking.csv")
df.head(n=5)


# Dimensions of the dataset as (rows, columns).

df.shape

(119390, 36)


# Checking Count & Data Types
# Prints, for every column, its non-null count and dtype, followed by the
# dtype totals and the memory usage of the frame.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 36 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal                            119390 non-null  object 
 13  country                         118902 non-null  object 
 14  market_segment                  119390 non-null  object 
 15  distribution_channel            119390 non-null  object 
 16  is_repeated_guest               119390 non-null  int64  
 17  previous_cancellations          119390 non-null  int64  
 18  previous_bookings_not_canceled  119390 non-null  int64  
 19  reserved_room_type              119390 non-null  object 
 20  assigned_room_type              119390 non-null  object 
 21  booking_changes                 119390 non-null  int64  
 22  deposit_type                    119390 non-null  object 
 23  agent                           103050 non-null  float64
 24  company                         6797 non-null    float64
 25  days_in_waiting_list            119390 non-null  int64  
 26  customer_type                   119390 non-null  object 
 27  adr                             119390 non-null  float64
 28  required_car_parking_spaces     119390 non-null  int64  
 29  total_of_special_requests       119390 non-null  int64  
 30  reservation_status              119390 non-null  object 
 31  reservation_status_date         119390 non-null  object 
 32  name                            119390 non-null  object 
 33  email                           119390 non-null  object 
 34  phone-number                    119390 non-null  object 
 35  credit_card                     119390 non-null  object 
dtypes: float64(4), int64(16), object(16)
memory usage: 32.8+ MB


# Number of missing values in each column.

df.isna().sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company                           112593
days_in_waiting_list                   0
customer_type                          0
adr                                    0
required_car_parking_spaces            0
total_of_special_requests              0
reservation_status                     0
reservation_status_date                0
name                                   0
email                                  0
phone-number                           0
credit_card                            0
dtype: int64


# Columns left out of the analysis: "agent" and "company" (which hold most of the
# missing values), the arrival week number, and the guests' personal details.
unused_columns = [
    "agent",
    "company",
    "arrival_date_week_number",
    "email",
    "phone-number",
    "credit_card",
    "name",
]
df.drop(columns=unused_columns, inplace=True)

# Discard the rows that still contain a missing value.
df.dropna(inplace=True)


# Parse "reservation_status_date" (read from the CSV as object/strings) once and
# keep plain calendar dates in the column itself.

status_timestamps = pd.to_datetime(df["reservation_status_date"])
df["reservation_status_date"] = status_timestamps.dt.date

# Year component of the reservation status date.
df["reservation_status_year"] = status_timestamps.dt.year


# Removing every row whose "reservation_status_year" is 2014.

rows_from_2014 = df.loc[df["reservation_status_year"] == 2014].index
df = df.drop(index=rows_from_2014)


# Turn the month names in "arrival_date_month" into month numbers, and store
# "children" (read as float) as integers.

arrival_month = pd.to_datetime(df["arrival_date_month"], format="%B")
df["arrival_date_month"] = arrival_month.dt.month

df["children"] = df["children"].astype(int)


# Assigning category names to the integer-coded categorical columns.

df["is_canceled"] = df["is_canceled"].replace({0: "Not Canceled", 1: "Canceled"})
df["is_repeated_guest"] = df["is_repeated_guest"].replace({0: "Not Repeated", 1: "Repeated"})

# Numeric 0/1 copy of the cancellation flag for the numeric analysis.
# It is derived with a comparison instead of replacing the labels back with
# integers: `replace` on the label column yields an object column and relies on
# pandas silently downcasting it to integers, which is deprecated behaviour.
df["is_canceled_binary"] = (df["is_canceled"] == "Canceled").astype("int64")


# Lookup table from month number (1-12) to month name.

month_mapping = dict(enumerate(
    [
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    ],
    start=1,
))

# Month of "reservation_status_date": first as a number, then as its name.

status_dates = pd.to_datetime(df["reservation_status_date"])
exit_month = status_dates.dt.month
df["exit_month_name"] = exit_month.map(month_mapping)


# Total length of stay: weekend nights plus week nights.

df["total_stays_nights"] = df["stays_in_weekend_nights"].add(df["stays_in_week_nights"])


# Summary statistics for the object (text) columns.

df.describe(include=["object"])


# Summary statistics for the numeric columns.

df.describe()


# Keeping only the rows whose "adr" is below 5000, which removes the outlier.

adr_below_cap = df["adr"].lt(5000)
df = df[adr_below_cap]
df.shape

(118716, 33)


df.head().T


# Share of bookings per hotel type, in percent.

df["hotel"].value_counts(normalize=True).mul(100)

City Hotel      66.647293
Resort Hotel    33.352707
Name: hotel, dtype: float64


# Visualization of Hotel types

plt.figure(figsize=(8, 4))

# One set of counts drives both the bar order and the labels, so each label is
# guaranteed to sit on the bar it describes (instead of relying on the order in
# which the hotel types happen to appear in the data).
hotel_counts = df["hotel"].value_counts(ascending=True)

ax = sns.countplot(data=df, x="hotel", order=hotel_counts.index, edgecolor="k", palette="flare")

# Annotate every bar with its booking count.
for index, value in enumerate(hotel_counts):
    ax.text(index, value, str(int(value)), ha="center", va="bottom", fontsize=10)

plt.title("Reservation Status by Hotel Types", size=15, weight="bold", color="#76448A")
plt.xlabel("Hotel Type")
plt.ylabel("Number of Reservations")
plt.ylim([0, 85000])

plt.show()


# Share of canceled versus not canceled bookings, in percent.

df["is_canceled"].value_counts(normalize=True).mul(100)

Not Canceled    62.961185
Canceled        37.038815
Name: is_canceled, dtype: float64


# Visualization of Cancellation Status

plt.figure(figsize=(8, 4))

# The same counts define the bar order and the labels, so a label can never end
# up on the wrong bar if the order of the rows in the data changes.
status_counts = df["is_canceled"].value_counts()

ax = sns.countplot(data=df, x="is_canceled", order=status_counts.index, edgecolor="k", palette="flare")

# Annotate every bar with its booking count.
for index, value in enumerate(status_counts):
    ax.text(index, value, str(int(value)), ha="center", va="bottom", fontsize=10)

plt.title("Cancellation Status", size=15, weight="bold", color="#76448A")
ax.set(xlabel=None)
plt.ylabel("Number of Cancellation")
plt.ylim([0, 80000])

plt.show()


# Bookings per hotel type, split by cancellation status.

plt.figure(figsize=(8, 4))

hotel_status_ax = sns.countplot(data=df, x="hotel", hue="is_canceled", edgecolor="k", palette="flare")

hotel_status_ax.set_title("Reservation Status in different Hotels by Cancellations",
                          size=14, weight="bold", color="#76448A")
hotel_status_ax.set_xlabel("Hotel Type")
hotel_status_ax.set_ylabel("Number of Reservations")
hotel_status_ax.legend(title="Cancellation Type", fontsize=7)

plt.show()


# Bar plot of the 5 countries with the highest number of reservations.

# Country codes and reservation counts, computed once with "value_counts()".
top_countries = df["country"].value_counts().head(5)
country_name = top_countries.index
country_count = top_countries.values

plt.figure(figsize=(8, 4))

# x and y are ready-made vectors of length 5, so no `data` frame is passed:
# recent seaborn versions reject vectors whose length differs from `data`.
ax = sns.barplot(x=country_name, y=country_count, edgecolor="k", palette="flare_r")

# Annotate every bar with its reservation count.
for index, value in enumerate(country_count):
    ax.text(index, value, str(int(value)), ha="center", va="bottom", fontsize=10)

plt.title("Countries with Highest Number of Reservations", size=13, weight="bold", color="#76448A")
plt.xlabel("Countries")
plt.ylabel("Number of Reservations")
plt.ylim([0, 55000])

plt.show()


# Canceled bookings only, and the 5 countries with the most cancellations.

canceled_data = df[df["is_canceled"] == "Canceled"]

top_canceled_countries = canceled_data["country"].value_counts().head()
can_country_name = top_canceled_countries.index
can_country_count = top_canceled_countries.values

print("Top 5 countries with their reservation cancellation counts:")
print(top_canceled_countries)

Top 5 countries with their reservation cancellation counts:
PRT    27333
GBR     2453
ESP     2177
FRA     1934
ITA     1333
Name: country, dtype: int64


# Donut chart: how the cancellations split across the top 5 countries.

plt.figure(figsize=(6, 5))

palette_color = sns.color_palette("flare_r")
explode = (0.05,) * 5  # same small offset for each of the five wedges

plt.pie(
    x=can_country_count,
    labels=can_country_name,
    colors=palette_color,
    autopct="%.1f%%",
    startangle=90,
    explode=explode,
    pctdistance=0.8,
)

# A white disc over the centre of the pie produces the donut hole.
centre_circle = plt.Circle((0, 0), 0.65, fc="white")
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.axis("equal")
plt.title("Countries with Highest Number of Cancellations", size=11, weight="bold", color="#76448A")

plt.show()


# Correlation heatmap of the numeric columns in the dataset.

# Selecting every numeric column. "number" is used instead of listing
# "int64"/"float64" so that integer columns stored with another width (for
# example the int32 values returned by the `.dt` accessors in recent pandas)
# are not silently left out of the heatmap.
numeric_columns = df.select_dtypes(include="number")

# Calculating the correlation matrix
correlation_matrix = numeric_columns.corr()

# Creating the heatmap using Seaborn
plt.figure(figsize=(10, 8))

sns.heatmap(correlation_matrix, annot=True, cmap="flare", fmt=".2f", linewidth=0.7)

plt.title("Correlation Heatmap", weight="bold", color="#76448A")

plt.show()


# Violin plot (top) and bar plot (bottom) of "Lead Time" against "Cancellation Status".

plt.figure(figsize=(13, 10))

status_order = ["Canceled", "Not Canceled"]

# Top panel: distribution of lead time per cancellation status.
ax1 = plt.subplot(2, 1, 1)
sns.violinplot(data=df, x="is_canceled", y="lead_time", order=status_order, palette="flare", ax=ax1)
ax1.set(xlabel=None)
ax1.set_ylabel("Lead Time")

# Bottom panel: lead time per cancellation status, labelled with the group means.
ax2 = plt.subplot(2, 1, 2)
groupby_can = df.groupby("is_canceled")["lead_time"].mean()
sns.barplot(data=df, x="is_canceled", y="lead_time", order=status_order,
            edgecolor="k", palette="flare", ax=ax2)

# The label is drawn 50 units below the mean so that it lands inside the bar.
for position, mean_lead_time in enumerate(groupby_can):
    ax2.text(position, mean_lead_time - 50, f"{mean_lead_time:.2f}",
             ha="center", va="bottom", fontsize=10, color="white")

ax2.set_xlabel("Cancellation Status")
ax2.set_ylabel("Lead Time")

plt.suptitle("Correlation between Lead Time & Cancellation Status", size=17, weight="bold", color="#76448A")
plt.tight_layout()

plt.show()


# Box plot (top) and count plot (bottom) of "Number of Special Requests" against "Cancellation Status".

plt.figure(figsize=(13, 10))

# Top panel: spread of special requests per cancellation status.
requests_box_ax = plt.subplot(2, 1, 1)
sns.boxplot(data=df, x="is_canceled", y="total_of_special_requests", palette="flare_r", ax=requests_box_ax)
requests_box_ax.set_xlabel("Cancellation Status")
requests_box_ax.set_ylabel("Number of Special Requests")

# Bottom panel: bookings per number of special requests, split by cancellation status.
requests_count_ax = plt.subplot(2, 1, 2)
sns.countplot(data=df, x="total_of_special_requests", hue="is_canceled",
              edgecolor="k", palette="flare", ax=requests_count_ax)
requests_count_ax.set_xlabel("Number of Special Requests")
requests_count_ax.set_ylabel("Count")
requests_count_ax.legend(title="Cancellation Status")

plt.suptitle("Correlation between Special Requests & Cancellation Status", weight="bold", color="#76448A", size=17)
plt.tight_layout()

plt.show()


# Number of bookings per reservation status year.

df["reservation_status_year"].value_counts()

2016    57528
2017    36395
2015    24793
Name: reservation_status_year, dtype: int64


# Two count plots next to each other: bookings per year split by hotel type (left)
# and bookings per year split by cancellation status (right).

plt.figure(figsize=(15, 5))

reservations_ax = plt.subplot(1, 2, 1)
sns.countplot(data=df, x="reservation_status_year", hue="hotel",
              edgecolor="k", palette="flare", ax=reservations_ax)
reservations_ax.set_title("Year on Year Reservations in Hotels", size=15)
reservations_ax.set_xlabel("Year")
reservations_ax.set_ylabel("Number of Reservations")
reservations_ax.legend(title="Hotel Type", fontsize=13)

cancellations_ax = plt.subplot(1, 2, 2)
sns.countplot(data=df, x="reservation_status_year", hue="is_canceled",
              edgecolor="k", palette="flare", ax=cancellations_ax)
cancellations_ax.set_title("Year on Year Cancellations", size=15)
cancellations_ax.set_xlabel("Year")
cancellations_ax.set_ylabel("Number of Cancellations")
cancellations_ax.legend(title="Cancellation Type", fontsize=13)

plt.suptitle("Year on Year Change", size=17, color="#76448A", weight="bold")
plt.tight_layout()  # keeps the two panels from overlapping

plt.show()


# Bookings per month of the reservation status date, most frequent month first.

plt.figure(figsize=(13, 5))

monthly_bookings = df["exit_month_name"].value_counts()

ax = sns.countplot(data=df, x="exit_month_name", edgecolor="k",
                   palette="flare_r", order=monthly_bookings.index)

# Annotate every bar with its booking count.
for position, bookings in enumerate(monthly_bookings):
    ax.text(position, bookings, str(int(bookings)), ha="center", va="bottom", fontsize=10)

ax.set_title("Month-wise Bookings", size=15, color="#76448A", weight="bold")
ax.set_xlabel("Months")
ax.set_ylabel("Number of Bookings")

plt.show()


# Canceled bookings per month of the reservation status date,
# most frequent month first.

plt.figure(figsize=(13, 5))

monthly_cancellations = canceled_data["exit_month_name"].value_counts()

ax = sns.countplot(data=canceled_data, x="exit_month_name", edgecolor="k",
                   palette="flare_r", order=monthly_cancellations.index)

# Annotate every bar with its cancellation count.
for position, cancellations in enumerate(monthly_cancellations):
    ax.text(position, cancellations, str(int(cancellations)), ha="center", va="bottom", fontsize=10)

ax.set_title("Month-wise Cancellations", size=15, color="#76448A", weight="bold")
ax.set_xlabel("Months")
ax.set_ylabel("Number of Cancellation")
ax.set_ylim([0, 6500])

plt.show()


# Bookings that were not canceled, per month of the reservation status date,
# most frequent month first.

plt.figure(figsize=(13, 5))

non_canceled_data = df[df["is_canceled"] == "Not Canceled"]
monthly_non_cancellations = non_canceled_data["exit_month_name"].value_counts()

ax = sns.countplot(data=non_canceled_data, x="exit_month_name", edgecolor="k",
                   palette="flare_r", order=monthly_non_cancellations.index)

# Annotate every bar with its booking count.
for position, bookings in enumerate(monthly_non_cancellations):
    ax.text(position, bookings, str(int(bookings)), ha="center", va="bottom", fontsize=10)

ax.set_title("Month-wise Non Cancellations", size=15, color="#76448A", weight="bold")
ax.set_xlabel("Months")
ax.set_ylabel("Number of Non Cancellation")
ax.set_ylim([0, 9000])

plt.show()


# Bookings per calendar month (January to December), split by cancellation status.

plt.figure(figsize=(13, 5))

# Calendar order for the x axis.
order = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]

monthly_status_ax = sns.countplot(data=df, x="exit_month_name", hue="is_canceled",
                                  order=order, edgecolor="k", palette="flare")

monthly_status_ax.set_title("Month-wise Cancellation Status at Check-Out", size=15, color="#76448A", weight="bold")
monthly_status_ax.set_xlabel("Months")
monthly_status_ax.set_ylabel("Number of Reservations")
monthly_status_ax.legend(title='Cancellation Type', fontsize=7)

plt.show()


# Bar plot of the summed ADR of canceled bookings per month, with the value
# written above each bar.

plt.figure(figsize=(13, 5))

# Ordered categorical version of the month name, so the groups follow the calendar.
df["exit_month_name_ordered"] = pd.Categorical(df["exit_month_name"], categories=order, ordered=True)

# `observed=False` is spelled out so that every month keeps its own bar (and its
# position) even if it has no canceled booking; leaving it implicit relies on a
# default that pandas has deprecated for categorical group keys.
canceled_bookings = df[df["is_canceled"] == "Canceled"]
grouped_data = (
    canceled_bookings
    .groupby("exit_month_name_ordered", observed=False)[["adr"]]
    .sum()
    .reset_index()
)

ax = sns.barplot(x="exit_month_name_ordered", y="adr", data=grouped_data, palette="flare_r", edgecolor="k")

# Annotate every bar with its summed ADR.
for index, value in enumerate(grouped_data["adr"]):
    ax.text(index, value, str(int(value)), ha="center", va="bottom", fontsize=9)

plt.title("Month-wise ADR for Canceled Bookings", size=15, weight="bold", color="#76448A")
plt.xlabel("Months")
plt.ylabel("ADR")
plt.ylim([0, 650000])

plt.show()


# Mean ADR per reservation status date, separately for "Resort Hotel" and "City Hotel",
# sorted by date.

resort_bookings = df[df["hotel"] == "Resort Hotel"]
resort_hotel_adr = resort_bookings.groupby("reservation_status_date")["adr"].mean().reset_index()
resort_hotel_adr = resort_hotel_adr.sort_values("reservation_status_date")

city_bookings = df[df["hotel"] == "City Hotel"]
city_hotel_adr = city_bookings.groupby("reservation_status_date")["adr"].mean().reset_index()
city_hotel_adr = city_hotel_adr.sort_values("reservation_status_date")


# Restrict both frames to the dates strictly between 2015-07-01 and 2017-10-01.
# The data outside this window was not consistent, so it is left out to keep the trend readable.

start_date = datetime.strptime("2015-07", "%Y-%m").date()
end_date = datetime.strptime("2017-10", "%Y-%m").date()

resort_dates = resort_hotel_adr["reservation_status_date"]
resort_hotel_adr = resort_hotel_adr[(resort_dates > start_date) & (resort_dates < end_date)]

city_dates = city_hotel_adr["reservation_status_date"]
city_hotel_adr = city_hotel_adr[(city_dates > start_date) & (city_dates < end_date)]


# Line plot of the mean daily rate over time, one line per hotel type,
# for the selected date range.

plt.figure(figsize=(20, 6))

colors = sns.color_palette("flare_r")

hotel_adr_ax = plt.gca()
hotel_adr_ax.plot(resort_hotel_adr["reservation_status_date"], resort_hotel_adr["adr"],
                  label="Resort Hotel", color=colors[0])
hotel_adr_ax.plot(city_hotel_adr["reservation_status_date"], city_hotel_adr["adr"],
                  label="City Hotel", color=colors[4])

hotel_adr_ax.set_title("Average Daily Rate by Hotels", size=19, weight="bold", color="#76448A")
hotel_adr_ax.legend(fontsize=17, loc="lower right")

plt.show()


# Mean ADR per reservation status date, separately for "Canceled" and "Not Canceled"
# bookings, sorted by date.

canceled_bookings_adr = df[df["is_canceled"] == "Canceled"]
canceled_adr = canceled_bookings_adr.groupby("reservation_status_date")["adr"].mean().reset_index()
canceled_adr = canceled_adr.sort_values("reservation_status_date")

kept_bookings_adr = df[df["is_canceled"] == "Not Canceled"]
not_canceled_adr = kept_bookings_adr.groupby("reservation_status_date")["adr"].mean().reset_index()
not_canceled_adr = not_canceled_adr.sort_values("reservation_status_date")


# Restrict both frames to the dates strictly between "start_date" and "end_date".
canceled_dates = canceled_adr["reservation_status_date"]
canceled_adr = canceled_adr[(canceled_dates > start_date) & (canceled_dates < end_date)]

not_canceled_dates = not_canceled_adr["reservation_status_date"]
not_canceled_adr = not_canceled_adr[(not_canceled_dates > start_date) & (not_canceled_dates < end_date)]


# Line plot of the mean daily rate over time, one line per cancellation status,
# for the selected date range.

plt.figure(figsize=(20, 6))

colors = sns.color_palette("flare_r")

status_adr_ax = plt.gca()
status_adr_ax.plot(canceled_adr["reservation_status_date"], canceled_adr["adr"],
                   label="Canceled", color=colors[0])
status_adr_ax.plot(not_canceled_adr["reservation_status_date"], not_canceled_adr["adr"],
                   label="Not Canceled", color=colors[4])

status_adr_ax.set_title("Average Daily Rate by Cancellations", size=19, weight="bold", color="#76448A")
status_adr_ax.legend(fontsize=17, loc="lower right")

plt.show()


# The 10 most frequent values of "stays_in_weekend_nights" and "stays_in_week_nights",
# together with how many bookings have each value.

top_weekend_nights = df["stays_in_weekend_nights"].value_counts().head(10)
weekend_nights_num = top_weekend_nights.index
weekend_nights_count = top_weekend_nights.values

top_week_nights = df["stays_in_week_nights"].value_counts().head(10)
week_nights_num = top_week_nights.index
week_nights_count = top_week_nights.values


# Two bar plots next to each other: bookings per number of week nights (left) and
# per number of weekend nights (right), for the 10 most frequent values of each.

plt.figure(figsize=(12, 4))

plt.suptitle("Top 10 Week vs Weekend Bookings", size=19, weight="bold", color="#76448A")

plt.subplot(1, 2, 1)

# x and y are ready-made vectors of length 10, so no `data` frame is passed:
# recent seaborn versions reject vectors whose length differs from `data`.
ax1 = sns.barplot(x=week_nights_num, y=week_nights_count,
                  edgecolor="k", palette="flare_r", order=week_nights_num)

# Annotate every bar with its booking count (the label text must be a string).
for index, value in enumerate(week_nights_count):
    ax1.text(index, value, str(int(value)), ha="center", va="bottom", fontsize=9)

plt.title("Count for Week Nights Bookings", size=15)
plt.xlabel("Number of Nights")
plt.ylabel("Count of Bookings")
plt.ylim([0, 55000])

plt.subplot(1, 2, 2)

ax2 = sns.barplot(x=weekend_nights_num, y=weekend_nights_count,
                  edgecolor="k", palette="flare_r", order=weekend_nights_num)

for index, value in enumerate(weekend_nights_count):
    ax2.text(index, value, str(int(value)), ha="center", va="bottom", fontsize=9)

plt.title("Count for Weekend Nights Bookings", size=15)
plt.xlabel("Number of Nights")
plt.ylabel("Count of Bookings")
plt.ylim([0, 55000])

plt.tight_layout()

plt.show()


# Bar plot of the bookings per total number of nights, for the 10 most frequent values.

top_total_stays = df["total_stays_nights"].value_counts().head(10)
total_stays_num = top_total_stays.index
total_stays_count = top_total_stays.values

plt.figure(figsize=(8, 4))

# x and y are ready-made vectors of length 10, so no `data` frame is passed:
# recent seaborn versions reject vectors whose length differs from `data`.
ax = sns.barplot(x=total_stays_num, y=total_stays_count, order=total_stays_num,
                 edgecolor="k", palette="flare_r")

# Annotate every bar with its booking count (the label text must be a string).
for index, value in enumerate(total_stays_count):
    ax.text(index, value, str(int(value)), ha="center", va="bottom", fontsize=9)

plt.title("Count of Bookings for Total Stay", size=15, weight="bold", color="#76448A")
plt.xlabel("Number of Nights")
plt.ylabel("Count of Bookings")
plt.ylim([0, 30000])

plt.show()


# Bar plot of the bookings for the 5 most frequently reserved room types.

top_room_types = df["reserved_room_type"].value_counts().head()
room_type = top_room_types.index
room_type_count = top_room_types.values

plt.figure(figsize=(8, 4))

# x and y are ready-made vectors of length 5, so no `data` frame is passed:
# recent seaborn versions reject vectors whose length differs from `data`.
ax = sns.barplot(x=room_type, y=room_type_count, order=room_type,
                 edgecolor="k", palette="flare_r")

# Annotate every bar with its booking count (the label text must be a string).
for index, value in enumerate(room_type_count):
    ax.text(index, value, str(int(value)), ha="center", va="bottom", fontsize=9)

plt.title("Top 5 Room Types Booked by Customers", size=15, weight="bold", color="#76448A")
plt.xlabel("Room Type")
plt.ylabel("Count of Bookings")
plt.ylim([0, 90000])

plt.show()


# Cancellation share, in percent, among the bookings whose assigned room type differs
# from the reserved one. This shows whether getting a different room than the one
# reserved goes along with more cancellations.

room_changed = df["reserved_room_type"].ne(df["assigned_room_type"])
diff_room = df[room_changed]
diff_room["is_canceled"].value_counts(normalize=True).mul(100)

Not Canceled    94.605075
Canceled         5.394925
Name: is_canceled, dtype: float64


# Share of each meal type, in percent.

df["meal"].value_counts(normalize=True).mul(100)

BB           77.227164
HB           12.158429
SC            8.960881
Undefined     0.981334
FB            0.672192
Name: meal, dtype: float64


# Count plot of the meal types, most frequent first.

plt.figure(figsize=(8, 4))

meal_counts = df["meal"].value_counts()

ax = sns.countplot(data=df, x="meal", order=meal_counts.index, edgecolor="k", palette="flare_r")

# Annotate every bar with its booking count.
for position, bookings in enumerate(meal_counts):
    ax.text(position, bookings, str(int(bookings)), ha="center", va="bottom", fontsize=10)

ax.set_title("Types of Meals Opted by Customers", size=13, weight="bold", color="#76448A")
ax.set_xlabel("Meal")
ax.set_ylabel("Number of Reservations")
ax.set_ylim([0, 100000])

plt.show()


# Share of each market segment, in percent.

df["market_segment"].value_counts(normalize=True).mul(100)

Online TA        47.510024
Offline TA/TO    20.350248
Groups           16.531891
Direct           10.484686
Corporate         4.305233
Complementary     0.618282
Aviation          0.199636
Name: market_segment, dtype: float64


# Donut chart of the 5 largest market segments.

plt.figure(figsize=(6, 5))

palette_color = sns.color_palette("flare_r")
explode = (0.05,) * 5  # same small offset for each of the five wedges

top_segments = df["market_segment"].value_counts().head()

plt.pie(
    x=top_segments.values,
    labels=top_segments.index,
    colors=palette_color,
    autopct="%.2f%%",
    startangle=90,
    explode=explode,
    pctdistance=0.8,
)

# A white disc over the centre of the pie produces the donut hole.
centre_circle = plt.Circle((0, 0), 0.65, fc="white")
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.axis("equal")
plt.title("Top 5 Market Segments by Share", size=11, weight="bold", color="#76448A")

plt.show()


# Share of repeated versus non-repeated guests, in percent.

df["is_repeated_guest"].value_counts(normalize=True).mul(100)

Not Repeated    96.945652
Repeated         3.054348
Name: is_repeated_guest, dtype: float64


# Share, in percent, of the 5 most frequent values of "previous_cancellations".

df["previous_cancellations"].value_counts(normalize=True).head().mul(100)

0     94.721015
1      4.918461
2      0.094343
3      0.054753
24     0.040433
Name: previous_cancellations, dtype: float64


# Share of each number of required car parking spaces, in percent.

df["required_car_parking_spaces"].value_counts(normalize=True).mul(100)

0    93.842448
1     6.129755
2     0.023586
3     0.002527
8     0.001685
Name: required_car_parking_spaces, dtype: float64


# Share of each customer type, in percent.

df["customer_type"].value_counts(normalize=True).mul(100)

Transient          75.113717
Transient-Party    20.972742
Contract            3.433404
Group               0.480137
Name: customer_type, dtype: float64


# Count plot of the customer types, most frequent first.

plt.figure(figsize=(8, 4))

customer_type_counts = df["customer_type"].value_counts()

ax = sns.countplot(data=df, x="customer_type", order=customer_type_counts.index,
                   edgecolor="k", palette="flare_r")

# Annotate every bar with its booking count.
for position, bookings in enumerate(customer_type_counts):
    ax.text(position, bookings, str(int(bookings)), ha="center", va="bottom", fontsize=10)

ax.set_title("Types of Customers by Bookings Count", size=13, weight="bold", color="#76448A")
ax.set_xlabel("Customer Categories")
ax.set_ylabel("Number of Reservations")
ax.set_ylim([0, 95000])

plt.show()


# Share of each customer type among the canceled bookings, in percent.

canceled_data["customer_type"].value_counts(normalize=True).mul(100)

Transient          82.913739
Transient-Party    14.084283
Contract            2.870073
Group               0.131905
Name: customer_type, dtype: float64


# Count plot of the customer types (most frequent first), split by cancellation status.

plt.figure(figsize=(8, 4))

customer_type_order = df["customer_type"].value_counts().index

ax = sns.countplot(data=df, x="customer_type", hue="is_canceled", order=customer_type_order,
                   edgecolor="k", palette="flare_r")

ax.set_title("Types of Customers with their Cancellation Status", size=13, weight="bold", color="#76448A")
ax.set_xlabel("Customer Categories")
ax.set_ylabel("Number of Reservations")
ax.legend(title="Cancellation Status")

plt.show()


# Share of each deposit type, in percent.

df["deposit_type"].value_counts(normalize=True).mul(100)

No Deposit    87.588868
Non Refund    12.274672
Refundable     0.136460
Name: deposit_type, dtype: float64


# Donut chart of the deposit types chosen by customers.

plt.figure(figsize=(6, 5))

palette_color = sns.color_palette("flare_r")
explode = (0.05,) * 3  # same small offset for each of the three wedges

deposit_counts = df["deposit_type"].value_counts().head()

plt.pie(
    x=deposit_counts.values,
    labels=deposit_counts.index,
    colors=palette_color,
    autopct="%.1f%%",
    startangle=90,
    explode=explode,
    pctdistance=0.8,
)

# A white disc over the centre of the pie produces the donut hole.
centre_circle = plt.Circle((0, 0), 0.65, fc="white")
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.axis("equal")
plt.title("Types of Deposits made by Customers", size=11, weight="bold", color="#76448A")

plt.show()


# Count plot of the deposit types (most frequent first), split by cancellation status.

plt.figure(figsize=(8, 4))

deposit_type_order = df["deposit_type"].value_counts().index

ax = sns.countplot(data=df, x="deposit_type", hue="is_canceled", order=deposit_type_order,
                   edgecolor="k", palette="flare_r")

ax.set_title("Types of Deposits made by Customers with their Cancellation Status",
             size=11, weight="bold", color="#76448A")
ax.set_xlabel("Deposit Types")
ax.set_ylabel("Number of Reservations")
ax.legend(title="Cancellation Status")

plt.show()

	hotel	is_canceled	meal	country	market_segment	distribution_channel	is_repeated_guest	reserved_room_type	assigned_room_type	deposit_type	customer_type	reservation_status	reservation_status_date	exit_month_name
count	118717	118717	118717	118717	118717	118717	118717	118717	118717	118717	118717	118717	118717	118717
unique	2	2	5	177	7	5	2	10	12	3	4	3	924	12
top	City Hotel	Not Canceled	BB	PRT	Online TA	TA/TO	Not Repeated	A	A	No Deposit	Transient	Check-Out	2015-10-21	July
freq	79122	74745	91682	48405	56402	97549	115091	85420	73682	103982	89173	74745	1461	12074

	lead_time	arrival_date_year	arrival_date_month	arrival_date_day_of_month	stays_in_weekend_nights	stays_in_week_nights	adults	children	babies	previous_cancellations	previous_bookings_not_canceled	booking_changes	days_in_waiting_list	adr	required_car_parking_spaces	total_of_special_requests	reservation_status_year	is_canceled_binary	total_stays_nights
count	118717.000000	118717.000000	118717.000000	118717.000000	118717.000000	118717.000000	118717.000000	118717.000000	118717.000000	118717.000000	118717.000000	118717.000000	118717.000000	118717.000000	118717.000000	118717.000000	118717.000000	118717.000000	118717.000000
mean	104.029617	2016.159421	6.551345	15.800854	0.930305	2.502902	1.858175	0.104366	0.007960	0.085582	0.131835	0.221518	2.334308	102.063542	0.061979	0.572555	2016.097728	0.370393	3.433207
std	106.737483	0.706551	3.089032	8.779217	0.996318	1.901514	0.578990	0.399456	0.097454	0.843575	1.485794	0.653225	17.643652	50.500364	0.244346	0.792967	0.711241	0.482912	2.546266
min	0.000000	2015.000000	1.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	-6.380000	0.000000	0.000000	2015.000000	0.000000	0.000000
25%	18.000000	2016.000000	4.000000	8.000000	0.000000	1.000000	2.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	70.000000	0.000000	0.000000	2016.000000	0.000000	2.000000
50%	69.000000	2016.000000	7.000000	16.000000	1.000000	2.000000	2.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	95.000000	0.000000	0.000000	2016.000000	0.000000	3.000000
75%	160.000000	2017.000000	9.000000	23.000000	2.000000	3.000000	2.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	126.000000	0.000000	1.000000	2017.000000	1.000000	4.000000
max	737.000000	2017.000000	12.000000	31.000000	16.000000	41.000000	55.000000	10.000000	10.000000	26.000000	72.000000	21.000000	391.000000	5400.000000	8.000000	5.000000	2017.000000	1.000000	57.000000

Introduction to Business Problem

Objective

Dataset Description

Importing Libraries

Loading the Dataset

Data Formatting and Cleaning

Data Analysis and Visualization

Note:

Note:

Note:

Interesting Insights:

Note:

Week Nights

Weekend Nights

Summary:

	hotel	lead_time	arrival_date_year	arrival_date_month	arrival_date_week_number	arrival_date_day_of_month	stays_in_week_nights	adults	...	customer_type	adr	total_of_special_requests	reservation_status	reservation_status_date	name	email	phone-number	credit_card
0	Resort Hotel	342	2015	July	27	1	0	2	...	Transient	0.0	0	Check-Out	2015-07-01	Ernest Barnes	Ernest.Barnes31@outlook.com	669-792-1661	************4322
1	Resort Hotel	737	2015	July	27	1	0	2	...	Transient	0.0	0	Check-Out	2015-07-01	Andrea Baker	Andrea_Baker94@aol.com	858-637-6955	************9157
2	Resort Hotel	7	2015	July	27	1	1	1	...	Transient	75.0	0	Check-Out	2015-07-02	Rebecca Parker	Rebecca_Parker@comcast.net	652-885-2745	************3734
3	Resort Hotel	13	2015	July	27	1	1	1	...	Transient	75.0	0	Check-Out	2015-07-02	Laura Murray	Laura_M@gmail.com	364-656-8427	************5677
4	Resort Hotel	14	2015	July	27	1	2	2	...	Transient	98.0	1	Check-Out	2015-07-03	Linda Hines	LHines@verizon.com	713-226-5883	************5498