import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw listings CSV into a DataFrame.
df = pd.read_csv('AB_NYC_2019.csv')

# Preview the first rows.
df.head()

# List the column labels.
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

# Summarise each column's dtype and non-null count, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     38843 non-null  object 
 13  reviews_per_month               38843 non-null  float64
 14  calculated_host_listings_count  48895 non-null  int64  
 15  availability_365                48895 non-null  int64  
dtypes: float64(3), int64(7), object(6)
memory usage: 6.0+ MB

# Descriptive statistics for the numeric columns.
df.describe()

# (rows, columns) of the raw dataset.
df.shape

(48895, 16)

# Show the dtype of every column under a heading (heading, blank line, table).
print("Data Types:\n", df.dtypes, sep="\n")

Data Types:

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

# Summary statistics for the price column (heading, blank line, table).
price_summary = df['price'].describe()
print("Price Statistics:\n", price_summary, sep="\n")

Price Statistics:

count    48895.000000
mean       152.720687
std        240.154170
min          0.000000
25%         69.000000
50%        106.000000
75%        175.000000
max      10000.000000
Name: price, dtype: float64

# Number of listings in each neighbourhood group, most frequent first.
print(df['neighbourhood_group'].value_counts())

neighbourhood_group
Manhattan        21661
Brooklyn         20104
Queens            5666
Bronx             1091
Staten Island      373
Name: count, dtype: int64

# Number of listings per room type, most frequent first.
print(df['room_type'].value_counts())

room_type
Entire home/apt    25409
Private room       22326
Shared room         1160
Name: count, dtype: int64

# Summary statistics for the availability_365 column.
print(df['availability_365'].describe())

count    48895.000000
mean       112.781327
std        131.622289
min          0.000000
25%          0.000000
50%         45.000000
75%        227.000000
max        365.000000
Name: availability_365, dtype: float64

# Count null cells column by column.
null_flags = df.isnull()
missing_values = null_flags.sum()

print("Missing Values Per Column:\n", missing_values, sep="\n")

Missing Values Per Column:

id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

# Count rows that are exact duplicates of an earlier row.
duplicate_flags = df.duplicated()
duplicates = duplicate_flags.sum()

print(f"Total Duplicate Rows: {duplicates}")

Total Duplicate Rows: 0

# Report how many distinct (non-null) values each object-typed column holds.
categorical_columns = df.select_dtypes(include=['object']).columns

distinct_counts = df[categorical_columns].nunique()
for col, n_distinct in distinct_counts.items():
    print(f"\nColumn: {col}")
    print(n_distinct)

Column: name
47905

Column: host_name
11452

Column: neighbourhood_group
5

Column: neighbourhood
221

Column: room_type
3

Column: last_review
1764

# Split the column labels into numeric (int64/float64) and object-typed groups.
numeric_frame = df.select_dtypes(include=['int64', 'float64'])
object_frame = df.select_dtypes(include=['object'])
numerical_columns = numeric_frame.columns
categorical_columns = object_frame.columns

print("Numerical Columns:\n", numerical_columns, sep="\n")

print("\nCategorical Columns:\n", categorical_columns, sep="\n")

Numerical Columns:

Index(['id', 'host_id', 'latitude', 'longitude', 'price', 'minimum_nights',
       'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'],
      dtype='object')

Categorical Columns:

Index(['name', 'host_name', 'neighbourhood_group', 'neighbourhood',
       'room_type', 'last_review'],
      dtype='object')

# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
correlation_matrix = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid theme for every plot in this section.
sns.set_style('whitegrid')

# Reload the raw listings CSV so the cleaning steps start from the original data.
df = pd.read_csv('AB_NYC_2019.csv')

# Preview the first rows.
df.head()

# Summarise each column's dtype and non-null count before cleaning.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  str    
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  str    
 4   neighbourhood_group             48895 non-null  str    
 5   neighbourhood                   48895 non-null  str    
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  str    
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     38843 non-null  str    
 13  reviews_per_month               38843 non-null  float64
 14  calculated_host_listings_count  48895 non-null  int64  
 15  availability_365                48895 non-null  int64  
dtypes: float64(3), int64(7), str(6)
memory usage: 6.0 MB

# Descriptive statistics for all columns, numeric and non-numeric alike.
df.describe(include='all')

# Parse the review date strings into datetimes.
df['last_review'] = pd.to_datetime(df['last_review'])

# Columns converted to the pandas 'category' dtype.
categorical_columns = [
    'neighbourhood_group',
    'neighbourhood',
    'room_type'
]

df = df.astype({name: 'category' for name in categorical_columns})

# Confirm the converted dtypes.
df.dtypes

id                                         int64
name                                         str
host_id                                    int64
host_name                                    str
neighbourhood_group                     category
neighbourhood                           category
latitude                                 float64
longitude                                float64
room_type                               category
price                                      int64
minimum_nights                             int64
number_of_reviews                          int64
last_review                       datetime64[us]
reviews_per_month                        float64
calculated_host_listings_count             int64
availability_365                           int64
dtype: object

# Count null cells per column before any imputation.
df.isnull().sum()

id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

# Remove exact duplicate rows and report the row count on either side.
rows_before = len(df)
df = df.drop_duplicates()
rows_after = len(df)

print("Rows before duplicate removal:", rows_before)
print("Rows after duplicate removal:", rows_after)

Rows before duplicate removal: 48895
Rows after duplicate removal: 48895

# Fill the gaps: a missing review rate becomes 0, a missing listing or host
# name becomes the placeholder 'Unknown'.
fill_values = {
    'reviews_per_month': 0,
    'name': 'Unknown',
    'host_name': 'Unknown',
}
df = df.fillna(fill_values)

# Discard listings whose price is zero or negative.
rows_before = len(df)
has_valid_price = df['price'].gt(0)
df = df.loc[has_valid_price]

print("Rows before removing invalid prices:", rows_before)
print("Rows after removing invalid prices:", len(df))

Rows before removing invalid prices: 48895
Rows after removing invalid prices: 48884

# Price spread before trimming.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(y=df['price'], ax=ax)
ax.set_title('Price Before Outlier Removal')
plt.show()

# Keep only listings priced strictly below 1000.
df = df.loc[df['price'].lt(1000)]

# Price spread after trimming.
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(y=df['price'], ax=ax)
ax.set_title('Price After Outlier Removal')
plt.show()

# Keep only listings whose minimum stay is at most 365 nights.
df = df.loc[df['minimum_nights'].le(365)]

# Month number of the most recent review (missing where there is no review date).
df = df.assign(month=df['last_review'].dt.month)

def get_season(month):
    """Return the season name for a calendar month number.

    Parameters
    ----------
    month : int or float
        Month number, 1-12. May be missing (NaN/None): the ``month`` column
        is derived from ``last_review``, which is empty for listings that
        have never been reviewed.

    Returns
    -------
    str
        'Winter' (12, 1, 2), 'Spring' (3-5), 'Summer' (6-8), 'Autumn' (9-11),
        or 'Unknown' when the month is missing or outside 1-12.
    """
    # A missing month must not fall through to a real season; previously every
    # listing without a review date was labelled 'Autumn'.
    if pd.isna(month):
        return 'Unknown'
    if month in (12, 1, 2):
        return 'Winter'
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    if month in (9, 10, 11):
        return 'Autumn'
    # Anything else is not a valid month number.
    return 'Unknown'

# Derive a season label for every row from its review month.
season_labels = df['month'].map(get_season)
df['season'] = season_labels

# Spot-check the date-derived columns side by side.
df[['last_review', 'month', 'season']].head()

# Column summary after cleaning and feature creation.
df.info()

<class 'pandas.DataFrame'>
Index: 48572 entries, 0 to 48894
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              48572 non-null  int64         
 1   name                            48572 non-null  str           
 2   host_id                         48572 non-null  int64         
 3   host_name                       48572 non-null  str           
 4   neighbourhood_group             48572 non-null  category      
 5   neighbourhood                   48572 non-null  category      
 6   latitude                        48572 non-null  float64       
 7   longitude                       48572 non-null  float64       
 8   room_type                       48572 non-null  category      
 9   price                           48572 non-null  int64         
 10  minimum_nights                  48572 non-null  int64         
 11  number_of_reviews               48572 non-null  int64         
 12  last_review                     38690 non-null  datetime64[us]
 13  reviews_per_month               48572 non-null  float64       
 14  calculated_host_listings_count  48572 non-null  int64         
 15  availability_365                48572 non-null  int64         
 16  month                           38690 non-null  float64       
 17  season                          48572 non-null  str           
dtypes: category(3), datetime64[us](1), float64(4), int64(7), str(3)
memory usage: 6.1 MB

# Descriptive statistics for all columns of the cleaned data.
df.describe(include='all')

# Persist the cleaned data without the index column.
cleaned_path = 'airbnb_cleaned.csv'
df.to_csv(cleaned_path, index=False)

print(f"Cleaned dataset saved as {cleaned_path}")

Cleaned dataset saved as airbnb_cleaned.csv

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid theme for every plot in this section.
sns.set_style('whitegrid')

# Load the cleaned listings and keep only rows with a positive price.
cleaned = pd.read_csv('airbnb_cleaned.csv')
df = cleaned.loc[cleaned['price'] > 0]

# Preview the first rows.
df.head()

# Column summary of the loaded data.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 48572 entries, 0 to 48571
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48572 non-null  int64  
 1   name                            48572 non-null  str    
 2   host_id                         48572 non-null  int64  
 3   host_name                       48572 non-null  str    
 4   neighbourhood_group             48572 non-null  str    
 5   neighbourhood                   48572 non-null  str    
 6   latitude                        48572 non-null  float64
 7   longitude                       48572 non-null  float64
 8   room_type                       48572 non-null  str    
 9   price                           48572 non-null  int64  
 10  minimum_nights                  48572 non-null  int64  
 11  number_of_reviews               48572 non-null  int64  
 12  last_review                     38690 non-null  str    
 13  reviews_per_month               48572 non-null  float64
 14  calculated_host_listings_count  48572 non-null  int64  
 15  availability_365                48572 non-null  int64  
 16  month                           38690 non-null  float64
 17  season                          48572 non-null  str    
dtypes: float64(4), int64(7), str(7)
memory usage: 6.7 MB

# Descriptive statistics for the numeric columns of the cleaned data.
df.describe()

# Null cells remaining per column after cleaning.
df.isnull().sum()

id                                   0
name                                 0
host_id                              0
host_name                            0
neighbourhood_group                  0
neighbourhood                        0
latitude                             0
longitude                            0
room_type                            0
price                                0
minimum_nights                       0
number_of_reviews                    0
last_review                       9882
reviews_per_month                    0
calculated_host_listings_count       0
availability_365                     0
month                             9882
season                               0
dtype: int64

# --- Histogram of listing prices ---
fig, ax = plt.subplots(figsize=(10, 3))
sns.histplot(df['price'], bins=100, ax=ax)
ax.set_title('Distribution of Airbnb Prices')
ax.set_xlabel('Price')
ax.set_ylabel('Count')
plt.show()

# --- Price spread per neighbourhood group ---
fig, ax = plt.subplots(figsize=(14, 6))
sns.boxplot(data=df, x='neighbourhood_group', y='price', ax=ax)
plt.xticks(rotation=45)
ax.set_title('Price Distribution by Neighborhood')
ax.set_xlabel('Neighborhood')
ax.set_ylabel('Price')
plt.show()

# --- Mean price per room type ---
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=df, x='room_type', y='price', ax=ax)
ax.set_title('Average Price by Room Type')
ax.set_xlabel('Room Type')
ax.set_ylabel('Average Price')
plt.show()

# --- Mean price per value of the 'month' column ---
price_by_month = df.groupby('month')['price']
monthly_prices = price_by_month.mean()

fig, ax = plt.subplots(figsize=(10, 5))
monthly_prices.plot(marker='o', ax=ax)
ax.set_title('Average Airbnb Prices by Month')
ax.set_xlabel('Month')
ax.set_ylabel('Average Price')
plt.show()

# --- Review count against price ---
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='number_of_reviews', y='price', ax=ax)
ax.set_title('Reviews vs Price')
ax.set_xlabel('Number of Reviews')
ax.set_ylabel('Price')
plt.show()

# --- Histogram of yearly availability ---
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['availability_365'], bins=40, ax=ax)
ax.set_title('Availability Distribution')
ax.set_xlabel('Days Available')
ax.set_ylabel('Count')
plt.show()

# --- Correlation heatmap over every numeric column ---
fig, ax = plt.subplots(figsize=(10, 6))
numeric_df = df.select_dtypes(include=np.number)
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Report the installed TensorFlow version and confirm the imports above succeeded.
print("TensorFlow version:", tf.__version__)
print("All libraries loaded successfully ✓")

TensorFlow version: 2.20.0
All libraries loaded successfully ✓

#Load the cleaned dataset from Section 2
try:
    from google.colab import files
except ImportError:
    # Not running in Colab: read the cleaned CSV from the working directory
    # instead of prompting for an upload.
    uploaded = {}
    filename = 'airbnb_cleaned.csv'
else:
    uploaded = files.upload()
    # A cancelled upload returns no files; fail with a clear message rather
    # than an IndexError on the empty result.
    if not uploaded:
        raise RuntimeError("No file was uploaded; please upload airbnb_cleaned.csv.")
    filename = next(iter(uploaded))   #First (only) uploaded file name

df = pd.read_csv(filename)

print(f"Loaded file: {filename}")
print(f"Dataset shape: {df.shape}")
df.head()

Saving airbnb_cleaned.csv to airbnb_cleaned (2).csv
Loaded file: airbnb_cleaned (2).csv
Dataset shape: (48572, 18)

#Numeric columns fed to the autoencoder
FEATURES = [
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

#Work on a separate frame holding only those columns, without incomplete rows
feature_frame = df.loc[:, FEATURES].copy()
data = feature_frame.dropna()

n_complete_rows = len(data)
print(f"Rows after dropping NaN: {n_complete_rows}")
data.describe()

Rows after dropping NaN: 48572

#Listings treated as "normal" for training: price within 10-500 (inclusive)
#and a minimum stay of at most 90 nights
price_in_range = data['price'].between(10, 500)
stay_in_range = data['minimum_nights'].le(90)
normal_mask = price_in_range & stay_in_range

data_normal = data.loc[normal_mask]
n_normal, n_total = len(data_normal), len(data)
print(f"Normal listings used for training: {n_normal} / {n_total}")
print(f"Potential anomalies (excluded from training): {n_total - n_normal}")

Normal listings used for training: 47656 / 48572
Potential anomalies (excluded from training): 916

#Min-max scale the features so the network sees inputs on a common scale.
#The scaler learns its min/max from the NORMAL listings only; the full dataset
#is then transformed with those same parameters for later anomaly scoring.
scaler = MinMaxScaler()
scaler.fit(data_normal)

X_train_raw = scaler.transform(data_normal)
X_all_scaled = scaler.transform(data)

#Hold out 20% of the normal listings for validation (fixed seed)
X_train, X_val = train_test_split(
    X_train_raw,
    test_size=0.2,
    random_state=42,
)

print(f"Training set shape:   {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")
print(f"Full dataset shape:   {X_all_scaled.shape}")

Training set shape:   (38124, 6)
Validation set shape: (9532, 6)
Full dataset shape:   (48572, 6)

#Number of features per listing (6)
_, INPUT_DIM = X_train.shape

#Fix the NumPy and TensorFlow random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

#Build the autoencoder
def build_autoencoder(input_dim):
    """Build the (uncompiled) dense autoencoder.

    The network is input_dim -> 32 -> 16 -> 8 -> 16 -> 32 -> input_dim.
    Hidden layers use ReLU; the output layer uses a sigmoid.

    Parameters
    ----------
    input_dim : int
        Number of features per sample.

    Returns
    -------
    keras.Model
        Model named 'autoencoder' mapping inputs to their reconstruction.
    """
    #(units, layer name) of every hidden layer, in forward order
    hidden_spec = (
        (32, 'encoder_1'),
        (16, 'encoder_2'),
        (8, 'bottleneck'),
        (16, 'decoder_1'),
        (32, 'decoder_2'),
    )

    net_input = keras.Input(shape=(input_dim,), name='input')

    tensor = net_input
    for units, layer_name in hidden_spec:
        tensor = layers.Dense(units, activation='relu', name=layer_name)(tensor)

    reconstruction = layers.Dense(input_dim, activation='sigmoid', name='output')(tensor)

    return keras.Model(net_input, reconstruction, name='autoencoder')


autoencoder = build_autoencoder(input_dim=INPUT_DIM)

#MSE loss measures how far the reconstruction is from the input
autoencoder.compile(loss='mse', optimizer='adam')

autoencoder.summary()

Model: "autoencoder"

┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ input (InputLayer)              │ (None, 6)              │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ encoder_1 (Dense)               │ (None, 32)             │           224 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ encoder_2 (Dense)               │ (None, 16)             │           528 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ bottleneck (Dense)              │ (None, 8)              │           136 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ decoder_1 (Dense)               │ (None, 16)             │           144 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ decoder_2 (Dense)               │ (None, 32)             │           544 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ output (Dense)                  │ (None, 6)              │           198 │
└─────────────────────────────────┴────────────────────────┴───────────────┘

 Total params: 1,774 (6.93 KB)

 Trainable params: 1,774 (6.93 KB)

 Non-trainable params: 0 (0.00 B)

#Stop once validation loss has not improved for 5 epochs, restoring the best weights
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

#Input and target are the same array: the model learns to reconstruct its input
fit_options = dict(
    epochs=50,
    batch_size=256,
    validation_data=(X_val, X_val),
    callbacks=[early_stop],
    verbose=1,
)
history = autoencoder.fit(X_train, X_train, **fit_options)

print("\nTraining complete ✓")

Epoch 1/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 2s 4ms/step - loss: 0.0908 - val_loss: 0.0231
Epoch 2/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0131 - val_loss: 0.0101
Epoch 3/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0068 - val_loss: 0.0048
Epoch 4/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0041 - val_loss: 0.0036
Epoch 5/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0033 - val_loss: 0.0030
Epoch 6/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0025 - val_loss: 0.0022
Epoch 7/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0020 - val_loss: 0.0019
Epoch 8/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 0.0018 - val_loss: 0.0017
Epoch 9/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.0016 - val_loss: 0.0016
Epoch 10/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.0016 - val_loss: 0.0015
Epoch 11/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0015 - val_loss: 0.0015
Epoch 12/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0015 - val_loss: 0.0014
Epoch 13/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0015 - val_loss: 0.0014
Epoch 14/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0014 - val_loss: 0.0014
Epoch 15/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0014 - val_loss: 0.0014
Epoch 16/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0014 - val_loss: 0.0014
Epoch 17/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0014 - val_loss: 0.0013
Epoch 18/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0014 - val_loss: 0.0013
Epoch 19/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0014 - val_loss: 0.0013
Epoch 20/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0014 - val_loss: 0.0013
Epoch 21/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0014 - val_loss: 0.0013
Epoch 22/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0014 - val_loss: 0.0013
Epoch 23/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0014 - val_loss: 0.0013
Epoch 24/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0014 - val_loss: 0.0013
Epoch 25/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0014 - val_loss: 0.0013
Epoch 26/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0014 - val_loss: 0.0013
Epoch 27/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0014 - val_loss: 0.0013
Epoch 28/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0014 - val_loss: 0.0013
Epoch 29/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 0.0013 - val_loss: 0.0013
Epoch 30/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0013 - val_loss: 0.0013
Epoch 31/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0013 - val_loss: 0.0013
Epoch 32/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.0013 - val_loss: 0.0013
Epoch 33/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 1s 6ms/step - loss: 0.0013 - val_loss: 0.0013
Epoch 34/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 1s 5ms/step - loss: 0.0013 - val_loss: 0.0013
Epoch 35/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 1s 4ms/step - loss: 0.0013 - val_loss: 0.0013
Epoch 36/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0013 - val_loss: 0.0013
Epoch 37/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0013 - val_loss: 0.0013
Epoch 38/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0013 - val_loss: 0.0013
Epoch 39/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0013 - val_loss: 0.0013
Epoch 40/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0013 - val_loss: 0.0013
Epoch 41/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0013 - val_loss: 0.0013
Epoch 42/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0013 - val_loss: 0.0013
Epoch 43/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0013 - val_loss: 0.0013
Epoch 44/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0013 - val_loss: 0.0013
Epoch 45/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0013 - val_loss: 0.0013
Epoch 46/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0013 - val_loss: 0.0013
Epoch 47/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 1s 3ms/step - loss: 0.0013 - val_loss: 0.0013
Epoch 48/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0013 - val_loss: 0.0013
Epoch 49/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.0013 - val_loss: 0.0010
Epoch 50/50
149/149 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 8.9533e-04 - val_loss: 4.1670e-04

Training complete ✓

#Training and validation loss per epoch
loss_curves = history.history

fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(loss_curves['loss'], label='Training Loss', linewidth=2)
ax.plot(loss_curves['val_loss'], label='Validation Loss', linewidth=2, linestyle='--')
ax.set_title('Autoencoder Training Loss', fontsize=14)
ax.set_xlabel('Epoch')
ax.set_ylabel('MSE Loss')
ax.legend()
fig.tight_layout()
fig.savefig('training_loss.png', dpi=150)
plt.show()

#Interpretation:
#A good model shows both curves falling and converging.
#Validation loss rising while training loss keeps falling indicates overfitting.

#Reconstruct every listing (normal and excluded alike)
X_pred = autoencoder.predict(X_all_scaled, verbose=0)

#Per-listing reconstruction error: mean squared difference across the features
squared_residuals = np.power(X_all_scaled - X_pred, 2)
reconstruction_errors = squared_residuals.mean(axis=1)

print("Reconstruction Error Statistics:")
print(f"  Min:    {reconstruction_errors.min():.6f}")
print(f"  Max:    {reconstruction_errors.max():.6f}")
print(f"  Mean:   {reconstruction_errors.mean():.6f}")
print(f"  Median: {np.median(reconstruction_errors):.6f}")

Reconstruction Error Statistics:
  Min:    0.000000
  Max:    1.769343
  Mean:   0.002722
  Median: 0.000183

#Derive the threshold from reconstruction errors on the training split
train_preds = autoencoder.predict(X_train, verbose=0)
train_residuals = X_train - train_preds
train_errors = np.power(train_residuals, 2).mean(axis=1)

THRESHOLD = np.percentile(train_errors, 95)
print(f"Anomaly Threshold (95th percentile of training errors): {THRESHOLD:.6f}")

#A listing is an anomaly when its error is strictly above the threshold
is_anomaly = np.greater(reconstruction_errors, THRESHOLD)
n_listings = len(is_anomaly)
n_anomalies = is_anomaly.sum()
print(f"\nTotal listings:  {n_listings}")
print(f"Anomalies found: {n_anomalies} ({is_anomaly.mean()*100:.1f}%)")

Anomaly Threshold (95th percentile of training errors): 0.001538

Total listings:  48572
Anomalies found: 3287 (6.8%)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

#Errors at or below the 99th percentile, for a readable zoomed view
clip = np.percentile(reconstruction_errors, 99)
clipped = reconstruction_errors[reconstruction_errors <= clip]

#One panel per view: (axis, errors, bar colour, title)
panels = (
    (axes[0], reconstruction_errors, 'steelblue',
     'Reconstruction Error Distribution (All Listings)'),
    (axes[1], clipped, 'coral',
     'Reconstruction Error Distribution (Zoomed – Bottom 99%)'),
)
for panel_ax, panel_errors, bar_colour, panel_title in panels:
    panel_ax.hist(panel_errors, bins=100, color=bar_colour, edgecolor='white', alpha=0.8)
    panel_ax.axvline(THRESHOLD, color='red', linestyle='--', linewidth=2, label=f'Threshold = {THRESHOLD:.4f}')
    panel_ax.set_title(panel_title, fontsize=13)
    panel_ax.set_xlabel('Reconstruction Error (MSE)')
    panel_ax.set_ylabel('Count')
    panel_ax.legend()

fig.tight_layout()
fig.savefig('reconstruction_error_distribution.png', dpi=150)
plt.show()

#Attach the scores and flags to the matching rows of the original dataframe
scored_rows = df.loc[data.index].reset_index(drop=True)
results_df = scored_rows.assign(
    anomaly_score=reconstruction_errors,
    is_anomaly=is_anomaly,
)

print("Sample of anomalies:")
flagged_rows = results_df.loc[results_df['is_anomaly']]
flagged_rows.sort_values('anomaly_score', ascending=False).head(10)

Sample of anomalies:

#Top overpriced listings: anomalies priced above the median, highest price first
median_price = results_df['price'].median()
flagged_above_median = results_df['is_anomaly'] & results_df['price'].gt(median_price)
overpriced = results_df.loc[flagged_above_median].sort_values('price', ascending=False)

report_columns = ['price', 'neighbourhood_group', 'room_type', 'minimum_nights', 'anomaly_score']
print("Top 10 High-price Anomaly Listings:")
print(overpriced[report_columns].head(10).to_string())

Top 10 High-price Anomaly Listings:
       price neighbourhood_group        room_type  minimum_nights  anomaly_score
46290    999           Manhattan     Private room               1       0.173788
36475    999           Manhattan     Private room               1       0.175465
18387    999           Manhattan  Entire home/apt               2       0.174193
20676    999           Manhattan     Private room               3       0.173976
41123    999           Manhattan     Private room               1       0.173694
10431    999            Brooklyn  Entire home/apt               2       0.182971
9009     999           Manhattan  Entire home/apt               7       0.174468
1891     999           Manhattan     Private room              10       0.175602
14978    999           Manhattan  Entire home/apt               3       0.175460
15040    999           Manhattan  Entire home/apt               2       0.177971

#Top underpriced listings: anomalies priced below the median, lowest price first
median_price = results_df['price'].median()
flagged_below_median = results_df['is_anomaly'] & results_df['price'].lt(median_price)
underpriced = results_df.loc[flagged_below_median].sort_values('price', ascending=True)

report_columns = ['price', 'neighbourhood_group', 'room_type', 'minimum_nights', 'anomaly_score']
print("Top 10 Low-price Anomaly Listings:")
print(underpriced[report_columns].head(10).to_string())

Top 10 Low-price Anomaly Listings:
       price neighbourhood_group        room_type  minimum_nights  anomaly_score
27782     10            Brooklyn  Entire home/apt               1       0.001831
20849     11            Brooklyn  Entire home/apt               2       0.002386
21137     12           Manhattan  Entire home/apt             300       0.928231
45347     13       Staten Island      Shared room               1       0.001975
21589     20               Bronx      Shared room               1       0.001906
36855     20              Queens  Entire home/apt               1       0.008787
28532     25              Queens     Private room               1       0.002432
13978     25            Brooklyn      Shared room               3       0.002803
40909     25              Queens     Private room               1       0.004399
29297     25              Queens     Private room               1       0.002127

# Price distribution: Normal vs Anomaly
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
# Use a readable label column as the hue so seaborn builds the legend itself.
# Calling plt.legend(labels=[...]) afterwards would replace seaborn's legend and
# pair the labels with the first bar patches by position, not with the hue
# levels, so the swatches would not match 'Normal'/'Anomaly'.
price_plot_df = results_df.loc[results_df['price'] <= 600, ['price', 'is_anomaly']].copy()
price_plot_df['status'] = price_plot_df['is_anomaly'].map({True: 'Anomaly', False: 'Normal'})
price_ax = sns.histplot(
    data=price_plot_df,
    x='price', hue='status',
    hue_order=['Normal', 'Anomaly'],
    bins=60, palette={'Anomaly': 'red', 'Normal': 'steelblue'}, alpha=0.7
)
plt.title('Price Distribution: Normal vs Anomaly')
plt.xlabel('Price ($)')
# Keep seaborn's handles and labels; only rename the legend title.
price_legend = price_ax.get_legend()
if price_legend is not None:
    price_legend.set_title('Is Anomaly')

plt.subplot(1, 2, 2)
# Share of flagged listings per neighbourhood group, highest rate first.
if 'neighbourhood_group' in results_df.columns:
    anomaly_by_area = results_df.groupby('neighbourhood_group')['is_anomaly'].mean().sort_values(ascending=False)
    anomaly_by_area.plot(kind='bar', color='coral', edgecolor='black')
    plt.title('Anomaly Rate by Neighbourhood Group')
    plt.xlabel('Neighbourhood Group')
    plt.ylabel('Anomaly Rate')
    plt.xticks(rotation=30, ha='right')

plt.tight_layout()
plt.savefig('anomaly_analysis.png', dpi=150)
plt.show()

#Write the listings together with their anomaly score and flag to CSV
output_path = 'airbnb_with_anomaly_scores.csv'
results_df.to_csv(output_path, index=False)
print(f"Saved: {output_path}")

#Summary statistics
banner = "=" * 40
flags = results_df['is_anomaly']
flagged_scores = results_df.loc[flags, 'anomaly_score']

print("\n" + banner)
print("        SECTION 4 SUMMARY")
print(banner)
print(f"Total listings analysed : {len(results_df)}")
print(f"Anomalies detected      : {flags.sum()}")
print(f"Anomaly rate            : {flags.mean()*100:.2f}%")
print(f"Anomaly threshold (MSE) : {THRESHOLD:.6f}")
print(f"Mean anomaly score      : {flagged_scores.mean():.6f}")
print("\nOutput columns in CSV:")
print("  anomaly_score  → reconstruction error per listing (float)")
print("  is_anomaly     → True if listing is anomalous, False if normal (bool)")
print(banner)

Saved: airbnb_with_anomaly_scores.csv

========================================
        SECTION 4 SUMMARY
========================================
Total listings analysed : 48572
Anomalies detected      : 3287
Anomaly rate            : 6.77%
Anomaly threshold (MSE) : 0.001538
Mean anomaly score      : 0.036315

Output columns in CSV:
  anomaly_score  → reconstruction error per listing (float)
  is_anomaly     → True if listing is anomalous, False if normal (bool)
========================================

# Persist the trained autoencoder; the messages below name Section 5
# (NAS Optimisation) as the consumer of this file.
autoencoder.save('autoencoder_base_model.keras')
print(
    "Base model saved as: autoencoder_base_model.keras\n"
    "This model will be used as the baseline in Section 5 (NAS Optimisation)."
)

Base model saved as: autoencoder_base_model.keras
This model will be used as the baseline in Section 5 (NAS Optimisation).

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree

# Load the cleaned listings produced by the preprocessing notebook.
df_import = pd.read_csv("airbnb_cleaned.csv")

# Keep only the columns used by the classifiers. The explicit .copy() makes
# `df` an independent frame, so the column insert and the .loc assignments
# performed on it later never act on (or warn about) a selection of df_import.
df = df_import[["neighbourhood_group", "room_type", "number_of_reviews", "availability_365", "price"]].copy()

df.head()

# Append an empty target column (position 5 = last); it starts as all-<NA>.
df.insert(5, "price_category", pd.NA)

df.head()

# Per (neighbourhood_group, room_type) price quartiles, one row per pair.
quartiles = pd.read_csv("5_quartiles.csv")

quartiles

# Label each listing relative to the quartiles of its own group/room-type pair:
#   0 -> price below Q1, 1 -> price within [Q1, Q3], 2 -> price above Q3.
for _, bounds in quartiles.iterrows():
    same_segment = (df["neighbourhood_group"] == bounds["neighbourhood_group"]) & (df["room_type"] == bounds["room_type"])
    below_q1 = df["price"] < bounds["Q1"]
    above_q3 = df["price"] > bounds["Q3"]
    within_iqr = (df["price"] >= bounds["Q1"]) & (df["price"] <= bounds["Q3"])
    df.loc[same_segment & below_q1, ["price_category"]] = 0
    df.loc[same_segment & within_iqr, ["price_category"]] = 1
    df.loc[same_segment & above_q3, ["price_category"]] = 2

# Count rows that matched no quartile row and therefore remain unlabelled.
df["price_category"].isna().sum()

np.int64(0)

df.head()

# Integer codes are the list positions. The leading "" placeholder shifts the
# neighbourhood codes so that Bronx=1 ... Staten Island=5.
neighbourhoods = ["", "Bronx", "Brooklyn", "Manhattan", "Queens", "Staten Island"]
room_types = ["Entire home/apt", "Private room", "Shared room"]

# Encode each column in a single pass with a lookup table instead of writing
# integers row-group by row-group into a text column via .loc; that approach
# leaves a mixed str/int column part-way through and relies on the column
# accepting non-string values. Values absent from the table are left untouched,
# exactly as before, so an unexpected label still surfaces in the int64 cast.
for column, labels in (("neighbourhood_group", neighbourhoods), ("room_type", room_types)):
    codes = {label: code for code, label in enumerate(labels)}
    encoded = df[column].map(codes)
    df[column] = encoded.where(encoded.notna(), df[column])

df

# Every column is numeric at this point; cast the whole frame to int64.
df = df.astype("int64")

df.dtypes

neighbourhood_group    int64
room_type              int64
number_of_reviews      int64
availability_365       int64
price                  int64
price_category         int64
dtype: object

# 80/20 train/test split on the three-feature set. A fixed random_state makes
# the split, and therefore every score reported from it, reproducible.
# NOTE(review): price_category was assigned from price within each
# (neighbourhood_group, room_type) pair, and all three are used as features
# here, so the label is a deterministic function of the inputs. Near-perfect
# accuracy is expected and says little about generalisation.
x_train, x_test, y_train, y_test = train_test_split(
    df[["neighbourhood_group", "room_type", "price"]],
    df.price_category,
    train_size = 0.8,
    random_state = 42,
)

# K = 3
knn_3 = KNeighborsClassifier(n_neighbors = 3)
knn_3.fit(x_train, y_train)
knn_3.score(x_test, y_test)

0.9947503860010294

# K = 5
# fit() returns the estimator itself, so construction and training are chained.
knn_5 = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
knn_5.score(x_test, y_test)

0.9904271744724653

# K = 10
# fit() returns the estimator itself, so construction and training are chained.
knn_10 = KNeighborsClassifier(n_neighbors=10).fit(x_train, y_train)
knn_10.score(x_test, y_test)

0.9834276891405044

# Decision tree with default settings, trained on the current train split and
# scored (mean accuracy) on the held-out rows.
dtree = tree.DecisionTreeClassifier().fit(x_train, y_train)
dtree.score(x_test, y_test)

1.0

# Re-split using all five feature columns and refit the same tree object (this
# replaces the previously fitted tree). A fixed random_state keeps the split,
# and so the reported accuracy, reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    df[["neighbourhood_group", "room_type", "number_of_reviews", "availability_365", "price"]],
    df.price_category,
    train_size = 0.8,
    random_state = 42,
)
dtree.fit(x_train, y_train)
dtree.score(x_test, y_test)

0.9994853319608852

	id	name	host_id	host_name	neighbourhood_group	neighbourhood	latitude	longitude	room_type	price	minimum_nights	number_of_reviews	last_review	reviews_per_month	calculated_host_listings_count	availability_365
0	2539	Clean & quiet apt home by the park	2787	John	Brooklyn	Kensington	40.64749	-73.97237	Private room	149	1	9	2018-10-19	0.21	6	365
1	2595	Skylit Midtown Castle	2845	Jennifer	Manhattan	Midtown	40.75362	-73.98377	Entire home/apt	225	1	45	2019-05-21	0.38	2	355
2	3647	THE VILLAGE OF HARLEM....NEW YORK !	4632	Elisabeth	Manhattan	Harlem	40.80902	-73.94190	Private room	150	3	0	NaN	NaN	1	365
3	3831	Cozy Entire Floor of Brownstone	4869	LisaRoxanne	Brooklyn	Clinton Hill	40.68514	-73.95976	Entire home/apt	89	1	270	2019-07-05	4.64	1	194
4	5022	Entire Apt: Spacious Studio/Loft by central park	7192	Laura	Manhattan	East Harlem	40.79851	-73.94399	Entire home/apt	80	10	9	2018-11-19	0.10	1	0

	id	host_id	latitude	longitude	price	minimum_nights	number_of_reviews	reviews_per_month	calculated_host_listings_count	availability_365
count	4.889500e+04	4.889500e+04	48895.000000	48895.000000	48895.000000	48895.000000	48895.000000	38843.000000	48895.000000	48895.000000
mean	1.901714e+07	6.762001e+07	40.728949	-73.952170	152.720687	7.029962	23.274466	1.373221	7.143982	112.781327
std	1.098311e+07	7.861097e+07	0.054530	0.046157	240.154170	20.510550	44.550582	1.680442	32.952519	131.622289
min	2.539000e+03	2.438000e+03	40.499790	-74.244420	0.000000	1.000000	0.000000	0.010000	1.000000	0.000000
25%	9.471945e+06	7.822033e+06	40.690100	-73.983070	69.000000	1.000000	1.000000	0.190000	1.000000	0.000000
50%	1.967728e+07	3.079382e+07	40.723070	-73.955680	106.000000	3.000000	5.000000	0.720000	1.000000	45.000000
75%	2.915218e+07	1.074344e+08	40.763115	-73.936275	175.000000	5.000000	24.000000	2.020000	2.000000	227.000000
max	3.648724e+07	2.743213e+08	40.913060	-73.712990	10000.000000	1250.000000	629.000000	58.500000	327.000000	365.000000

	id	name	host_id	host_name	neighbourhood_group	neighbourhood	latitude	longitude	room_type	price	minimum_nights	number_of_reviews	last_review	reviews_per_month	calculated_host_listings_count	availability_365
0	2539	Clean & quiet apt home by the park	2787	John	Brooklyn	Kensington	40.64749	-73.97237	Private room	149	1	9	2018-10-19	0.21	6	365
1	2595	Skylit Midtown Castle	2845	Jennifer	Manhattan	Midtown	40.75362	-73.98377	Entire home/apt	225	1	45	2019-05-21	0.38	2	355
2	3647	THE VILLAGE OF HARLEM....NEW YORK !	4632	Elisabeth	Manhattan	Harlem	40.80902	-73.94190	Private room	150	3	0	NaN	NaN	1	365
3	3831	Cozy Entire Floor of Brownstone	4869	LisaRoxanne	Brooklyn	Clinton Hill	40.68514	-73.95976	Entire home/apt	89	1	270	2019-07-05	4.64	1	194
4	5022	Entire Apt: Spacious Studio/Loft by central park	7192	Laura	Manhattan	East Harlem	40.79851	-73.94399	Entire home/apt	80	10	9	2018-11-19	0.10	1	0

	id	name	host_id	host_name	neighbourhood_group	neighbourhood	latitude	longitude	room_type	price	minimum_nights	number_of_reviews	last_review	reviews_per_month	calculated_host_listings_count	availability_365	month	season
0	2539	Clean & quiet apt home by the park	2787	John	Brooklyn	Kensington	40.64749	-73.97237	Private room	149	1	9	2018-10-19	0.21	6	365	10.0	Autumn
1	2595	Skylit Midtown Castle	2845	Jennifer	Manhattan	Midtown	40.75362	-73.98377	Entire home/apt	225	1	45	2019-05-21	0.38	2	355	5.0	Spring
2	3647	THE VILLAGE OF HARLEM....NEW YORK !	4632	Elisabeth	Manhattan	Harlem	40.80902	-73.94190	Private room	150	3	0	NaN	0.00	1	365	NaN	Autumn
3	3831	Cozy Entire Floor of Brownstone	4869	LisaRoxanne	Brooklyn	Clinton Hill	40.68514	-73.95976	Entire home/apt	89	1	270	2019-07-05	4.64	1	194	7.0	Summer
4	5022	Entire Apt: Spacious Studio/Loft by central park	7192	Laura	Manhattan	East Harlem	40.79851	-73.94399	Entire home/apt	80	10	9	2018-11-19	0.10	1	0	11.0	Autumn

	id	host_id	latitude	longitude	price	minimum_nights	number_of_reviews	reviews_per_month	calculated_host_listings_count	availability_365	month
count	4.857200e+04	4.857200e+04	48572.000000	48572.000000	48572.000000	48572.000000	48572.000000	48572.000000	48572.000000	48572.000000	38690.000000
mean	1.902306e+07	6.764521e+07	40.728927	-73.952028	140.269826	6.784176	23.378016	1.095586	7.170345	112.314440	6.174464
std	1.097852e+07	7.861058e+07	0.054582	0.046160	112.904535	16.129464	44.656757	1.600025	33.050706	131.352383	2.529264
min	2.539000e+03	2.438000e+03	40.499790	-74.244420	10.000000	1.000000	0.000000	0.000000	1.000000	0.000000	1.000000
25%	9.476845e+06	7.831209e+06	40.690000	-73.982950	69.000000	1.000000	1.000000	0.040000	1.000000	0.000000	5.000000
50%	1.967743e+07	3.085513e+07	40.722960	-73.955580	105.000000	3.000000	5.000000	0.380000	1.000000	44.000000	6.000000
75%	2.914961e+07	1.074344e+08	40.763130	-73.936100	175.000000	5.000000	24.000000	1.600000	2.000000	225.000000	7.000000
max	3.648724e+07	2.743213e+08	40.913060	-73.712990	999.000000	365.000000	629.000000	58.500000	327.000000	365.000000	12.000000

	id	name	host_id	host_name	neighbourhood_group	neighbourhood	latitude	longitude	room_type	price	minimum_nights	number_of_reviews	last_review	reviews_per_month	calculated_host_listings_count	availability_365
count	4.889500e+04	48879	4.889500e+04	48874	48895	48895	48895.000000	48895.000000	48895	48895.000000	48895.000000	48895.000000	38843	38843.000000	48895.000000	48895.000000
unique	NaN	47905	NaN	11452	5	221	NaN	NaN	3	NaN	NaN	NaN	1764	NaN	NaN	NaN
top	NaN	Hillside Hotel	NaN	Michael	Manhattan	Williamsburg	NaN	NaN	Entire home/apt	NaN	NaN	NaN	2019-06-23	NaN	NaN	NaN
freq	NaN	18	NaN	417	21661	3920	NaN	NaN	25409	NaN	NaN	NaN	1413	NaN	NaN	NaN
mean	1.901714e+07	NaN	6.762001e+07	NaN	NaN	NaN	40.728949	-73.952170	NaN	152.720687	7.029962	23.274466	NaN	1.373221	7.143982	112.781327
std	1.098311e+07	NaN	7.861097e+07	NaN	NaN	NaN	0.054530	0.046157	NaN	240.154170	20.510550	44.550582	NaN	1.680442	32.952519	131.622289
min	2.539000e+03	NaN	2.438000e+03	NaN	NaN	NaN	40.499790	-74.244420	NaN	0.000000	1.000000	0.000000	NaN	0.010000	1.000000	0.000000
25%	9.471945e+06	NaN	7.822033e+06	NaN	NaN	NaN	40.690100	-73.983070	NaN	69.000000	1.000000	1.000000	NaN	0.190000	1.000000	0.000000
50%	1.967728e+07	NaN	3.079382e+07	NaN	NaN	NaN	40.723070	-73.955680	NaN	106.000000	3.000000	5.000000	NaN	0.720000	1.000000	45.000000
75%	2.915218e+07	NaN	1.074344e+08	NaN	NaN	NaN	40.763115	-73.936275	NaN	175.000000	5.000000	24.000000	NaN	2.020000	2.000000	227.000000
max	3.648724e+07	NaN	2.743213e+08	NaN	NaN	NaN	40.913060	-73.712990	NaN	10000.000000	1250.000000	629.000000	NaN	58.500000	327.000000	365.000000

	id	name	host_id	host_name	neighbourhood_group	neighbourhood	latitude	longitude	room_type	price	minimum_nights	number_of_reviews	last_review	reviews_per_month	calculated_host_listings_count	availability_365	month	season
count	4.857200e+04	48572	4.857200e+04	48572	48572	48572	48572.000000	48572.000000	48572	48572.000000	48572.000000	48572.000000	38690	48572.000000	48572.000000	48572.000000	38690.000000	48572
unique	NaN	47591	NaN	11403	5	221	NaN	NaN	3	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	4
top	NaN	Hillside Hotel	NaN	Michael	Manhattan	Williamsburg	NaN	NaN	Entire home/apt	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	Summer
freq	NaN	18	NaN	415	21441	3905	NaN	NaN	25155	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	21136
mean	1.902306e+07	NaN	6.764521e+07	NaN	NaN	NaN	40.728927	-73.952028	NaN	140.269826	6.784176	23.378016	2018-10-04 15:19:20.672008	1.095586	7.170345	112.314440	6.174464	NaN
min	2.539000e+03	NaN	2.438000e+03	NaN	NaN	NaN	40.499790	-74.244420	NaN	10.000000	1.000000	0.000000	2011-03-28 00:00:00	0.000000	1.000000	0.000000	1.000000	NaN
25%	9.476845e+06	NaN	7.831209e+06	NaN	NaN	NaN	40.690000	-73.982950	NaN	69.000000	1.000000	1.000000	2018-07-10 00:00:00	0.040000	1.000000	0.000000	5.000000	NaN
50%	1.967743e+07	NaN	3.085513e+07	NaN	NaN	NaN	40.722960	-73.955580	NaN	105.000000	3.000000	5.000000	2019-05-19 00:00:00	0.380000	1.000000	44.000000	6.000000	NaN
75%	2.914961e+07	NaN	1.074344e+08	NaN	NaN	NaN	40.763130	-73.936100	NaN	175.000000	5.000000	24.000000	2019-06-23 00:00:00	1.600000	2.000000	225.000000	7.000000	NaN
max	3.648724e+07	NaN	2.743213e+08	NaN	NaN	NaN	40.913060	-73.712990	NaN	999.000000	365.000000	629.000000	2019-07-08 00:00:00	58.500000	327.000000	365.000000	12.000000	NaN
std	1.097852e+07	NaN	7.861058e+07	NaN	NaN	NaN	0.054582	0.046160	NaN	112.904535	16.129464	44.656757	NaN	1.600025	33.050706	131.352383	2.529264	NaN

	id	name	host_id	host_name	neighbourhood_group	neighbourhood	latitude	longitude	room_type	price	minimum_nights	number_of_reviews	last_review	reviews_per_month	calculated_host_listings_count	availability_365	month	season	anomaly_score	is_anomaly
13781	10514203	Giant Landmark Apartment in the Sky	3710888	Lisa	Brooklyn	Park Slope	40.67546	-73.97528	Private room	350	365	0	NaN	0.00	1	364	NaN	Autumn	1.769343	True
15267	12328112	GREENPOINT OASIS	1180190	Justin	Brooklyn	Greenpoint	40.73100	-73.95480	Entire home/apt	450	365	17	2019-01-03	0.50	1	365	1.0	Winter	1.765655	True
2814	1586935	Luxury Gramercy Lg 1Bd w Balcony	8457613	Erin	Manhattan	Gramercy	40.73494	-73.98751	Entire home/apt	250	365	0	NaN	0.00	1	365	NaN	Autumn	1.763984	True
4738	3399909	Super cute and sunny 2 bedroom	39304	Andrea	Brooklyn	Williamsburg	40.71852	-73.94165	Entire home/apt	240	365	0	NaN	0.00	1	363	NaN	Autumn	1.761836	True
15859	12916189	Family Friendly BK Townhome With Garden Oasis!	951917	Julia And Juan	Brooklyn	Sunset Park	40.66224	-73.99805	Entire home/apt	196	365	4	2018-05-20	0.12	1	365	5.0	Spring	1.760066	True
17212	13687060	Luxury drmn Bldg + Empire State Views & Roof Top!	21419119	Sebastien	Manhattan	Kips Bay	40.74033	-73.98268	Entire home/apt	189	365	7	2018-03-11	0.19	1	362	3.0	Spring	1.757191	True
1443	649561	Manhattan Sky Crib (1 year sublet)	3260084	David	Manhattan	Chelsea	40.75164	-73.99425	Entire home/apt	135	365	0	NaN	0.00	1	365	NaN	Autumn	1.756533	True
5329	3891031	LAST DAY TO BOOK Bedroom on Wall St	20133610	Shi Qing	Manhattan	Financial District	40.70940	-74.00278	Private room	139	365	13	2015-07-15	0.23	1	365	7.0	Summer	1.756406	True
753	271694	Easy, comfortable studio in Midtown	1387370	James	Manhattan	Midtown	40.75282	-73.97315	Entire home/apt	125	365	19	2015-09-08	0.21	1	365	9.0	Autumn	1.755576	True
2141	992977	Park Slope Pre-War Apartment	4000059	Shahdiya	Brooklyn	Park Slope	40.67359	-73.97434	Entire home/apt	100	365	1	2013-08-01	0.01	1	365	8.0	Summer	1.755009	True

Item	Value
Model type	Autoencoder (Self-Supervised)
Framework	TensorFlow / Keras
Features used	price, minimum_nights, number_of_reviews, reviews_per_month, calculated_host_listings_count, availability_365
Training data	Normal listings only (price $10–500, min_nights ≤ 90)
Anomaly threshold	95th percentile of training reconstruction errors
Output file	`airbnb_with_anomaly_scores.csv`
Saved model	`autoencoder_base_model.keras`

	neighbourhood_group	room_type	number_of_reviews	availability_365	price	price_category
0	Brooklyn	Private room	9	365	149	<NA>
1	Manhattan	Entire home/apt	45	355	225	<NA>
2	Manhattan	Private room	0	365	150	<NA>
3	Brooklyn	Entire home/apt	270	194	89	<NA>
4	Manhattan	Entire home/apt	9	0	80	<NA>

	neighbourhood_group	room_type	Q1	Q3
0	Bronx	Entire home/apt	80.0	140.0
1	Bronx	Private room	40.0	70.0
2	Bronx	Shared room	28.0	55.5
3	Brooklyn	Entire home/apt	104.0	198.2
4	Brooklyn	Private room	50.0	80.0
5	Brooklyn	Shared room	30.0	50.0
6	Manhattan	Entire home/apt	140.0	250.0
7	Manhattan	Private room	67.0	120.0
8	Manhattan	Shared room	49.0	88.0
9	Queens	Entire home/apt	90.0	165.0
10	Queens	Private room	47.0	75.0
11	Queens	Shared room	30.0	50.5
12	Staten Island	Entire home/apt	75.0	150.0
13	Staten Island	Private room	40.0	75.0
14	Staten Island	Shared room	29.0	75.0

	neighbourhood_group	room_type	number_of_reviews	availability_365	price	price_category
0	2	1	9	365	149	2
1	3	0	45	355	225	1
2	3	1	0	365	150	2
3	2	0	270	194	89	0
4	3	0	9	0	80	0
...	...	...	...	...	...	...
48567	2	1	0	9	70	1
48568	2	1	0	36	40	0
48569	3	0	0	27	115	0
48570	3	2	0	2	55	1
48571	3	1	0	23	90	1

Team Project: Airbnb Business Analysis

Data Types¶

Price Variable¶

Neighborhood Variable¶

Room Type Variable¶

Availability Variable¶

Missing Values¶

Duplicate Values Check¶

Unique Values in Categorical Variables¶

Understanding Dataset¶

Airbnb Dataset Cleaning & Preprocessing¶

Objective¶

1. Import Libraries¶

2. Load Dataset¶

3. Dataset Overview¶

4. Convert Columns to Correct Data Types¶

5. Check Missing Values¶

Observations¶

6. Remove Duplicates¶

7. Handle Missing Values¶

8. Remove Invalid Prices¶

9. Remove Extreme Price Outliers¶

10. Remove Extreme Minimum Nights Outliers¶

11. Feature Engineering¶

12. Final Dataset Summary¶

13. Save Cleaned Dataset¶

Conclusion¶

Airbnb Pricing Analysis - Exploratory Data Analysis (EDA)¶

Objective¶

1. Import Libraries¶

2. Load Dataset¶

3. Dataset Overview¶

Observations¶

4. Check Missing Values¶

Observation¶

5. Price Distribution¶

Observation¶

6. Price by Neighborhood¶

Observation¶

7. Room Type Analysis¶

Observation¶

8. Seasonal Trends¶

Observation¶

9. Reviews vs Price¶

Observation¶

10. Availability Analysis¶

Observation¶

11. Correlation Heatmap¶

12. Key Findings¶

Conclusion¶

Section 4 – Self-Supervised Model: Anomaly Detection with an Autoencoder¶

What is an Autoencoder?¶

Step 0 – Install & Import Libraries¶

Step 1 – Load the Cleaned Dataset¶

Step 2 – Select Features for the Autoencoder¶

Step 3 – Prepare Training Data (Normal Listings Only)¶

Step 4 – Build the Autoencoder Model¶

Step 5 – Train the Model¶

Step 6 – Plot Training & Validation Loss¶

Step 7 – Calculate Reconstruction Error (Anomaly Score)¶

Step 8 – Set Anomaly Threshold and Label Anomalies¶

Step 9 – Visualise the Reconstruction Error Distribution¶

Step 10 – Analyse the Anomalies¶

Step 11 – Save the Output Dataset¶

Step 12 – Save the Trained Model¶

Summary¶

Key Findings¶

References¶

Airbnb Dataset NAS (Model Optimisation)¶

Objective¶

Business Question¶

Machine Learning Models¶

1. Import Libraries and Data¶

2. Select Columns for Models¶

3. Create Price Category Column¶

4. Import Quartile Data¶

5. Put Values in Price Category Column¶

6. Check for NA Values¶

7. Convert Categories to Integers¶

8. Change Data Types to Integer¶