In [1]:
%pip install seaborn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy
import pandas as pd
import os
import requests
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler,MinMaxScaler, OneHotEncoder,StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from sklearn.compose import ColumnTransformer
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

import seaborn as sns
pd.options.display.max_rows = 999

In [3]:
url = "https://api.repliers.io/listings?propertyType=detached&class=residential"

headers = {
    "accept": "application/json",
    "content-type": "application/json",
    "REPLIERS-API-KEY": ""
}

post = requests.post(url, headers=headers)
response_dict = json.loads(post.text)
df = pd.json_normalize(response_dict['listings'])
pd.set_option('display.max_columns', None)

In [4]:
df.columns.to_list()

['mlsNumber',
 'resource',
 'status',
 'class',
 'type',
 'listPrice',
 'listDate',
 'lastStatus',
 'soldPrice',
 'soldDate',
 'originalPrice',
 'assignment',
 'images',
 'photoCount',
 'daysOnMarket',
 'occupancy',
 'updatedOn',
 'coopCompensation',
 'openHouse',
 'rooms',
 'agents',
 'boardId',
 'address.area',
 'address.city',
 'address.country',
 'address.district',
 'address.majorIntersection',
 'address.neighborhood',
 'address.streetDirection',
 'address.streetName',
 'address.streetNumber',
 'address.streetSuffix',
 'address.unitNumber',
 'address.zip',
 'address.state',
 'address.communityCode',
 'address.streetDirectionPrefix',
 'map.latitude',
 'map.longitude',
 'map.point',
 'permissions.displayAddressOnInternet',
 'permissions.displayPublic',
 'permissions.displayInternetEntireListing',
 'details.airConditioning',
 'details.basement1',
 'details.basement2',
 'details.centralVac',
 'details.den',
 'details.description',
 'details.elevator',
 'details.exteriorConstruction1',

In [5]:
df.isna().mean().reset_index().sort_values(by=0)

Unnamed: 0,index,0
0,mlsNumber,0.0
37,map.latitude,0.0
38,map.longitude,0.0
39,map.point,0.0
40,permissions.displayAddressOnInternet,0.0
41,permissions.displayPublic,0.0
42,permissions.displayInternetEntireListing,0.0
43,details.airConditioning,0.0
44,details.basement1,0.0
48,details.description,0.0


In [6]:
na_percentage = df.isna().mean().reset_index()
df = df[[x for x in df.columns if x not in list(na_percentage[na_percentage[0] >= 0.5]['index'])]]

In [7]:
df.columns.to_list()

['mlsNumber',
 'resource',
 'status',
 'class',
 'type',
 'listPrice',
 'listDate',
 'lastStatus',
 'soldPrice',
 'originalPrice',
 'images',
 'photoCount',
 'daysOnMarket',
 'occupancy',
 'updatedOn',
 'coopCompensation',
 'openHouse',
 'rooms',
 'agents',
 'boardId',
 'address.area',
 'address.city',
 'address.district',
 'address.majorIntersection',
 'address.neighborhood',
 'address.streetName',
 'address.streetNumber',
 'address.streetSuffix',
 'address.zip',
 'address.state',
 'address.communityCode',
 'map.latitude',
 'map.longitude',
 'map.point',
 'permissions.displayAddressOnInternet',
 'permissions.displayPublic',
 'permissions.displayInternetEntireListing',
 'details.airConditioning',
 'details.basement1',
 'details.den',
 'details.description',
 'details.exteriorConstruction1',
 'details.extras',
 'details.garage',
 'details.heating',
 'details.numBathrooms',
 'details.numBedrooms',
 'details.numFireplaces',
 'details.numGarageSpaces',
 'details.numParkingSpaces',
 'detail

In [8]:
for i in df.columns.to_list():
    print(df[i].describe())

count           100
unique           84
top       X10421506
freq              2
Name: mlsNumber, dtype: object
count               100
unique                2
top       Property:2381
freq                 59
Name: resource, dtype: object
count     100
unique      1
top         A
freq      100
Name: status, dtype: object
count                     100
unique                      1
top       ResidentialProperty
freq                      100
Name: class, dtype: object
count      100
unique       2
top       Sale
freq        78
Name: type, dtype: object
count    1.000000e+02
mean     1.044883e+06
std      9.264320e+05
min      1.750000e+03
25%      3.750000e+05
50%      8.919440e+05
75%      1.382425e+06
max      4.299000e+06
Name: listPrice, dtype: float64
count                               100
unique                                1
top       2024-11-13T00:00:00.000-00:00
freq                                100
Name: listDate, dtype: object
count     100
unique      1
top       New
freq  

In [9]:
feature_list = ['mlsNumber',
 'listPrice',
 'lastStatus',
 'soldPrice',
 'daysOnMarket',
 'address.district',
 'map.latitude',
 'map.longitude',
 'details.airConditioning',
 'details.basement1',
 'details.den',
 'details.exteriorConstruction1',
 'details.garage',
 'details.numBathrooms',
 'details.numBedrooms',
 'details.numFireplaces',
 'details.numGarageSpaces',
 'details.numParkingSpaces',
 'details.numRooms',
 'details.numRoomsPlus',
 'details.propertyType',
 'details.sqft',
 'details.style',
 'details.yearBuilt',
 'details.familyRoom',
 'details.numKitchens',
 'lot.depth',
 'lot.width',
 'nearby.amenities',
 'taxes.annualAmount']

In [10]:
df = df[feature_list]


In [11]:
df.head()

Unnamed: 0,mlsNumber,listPrice,lastStatus,soldPrice,daysOnMarket,address.district,map.latitude,map.longitude,details.airConditioning,details.basement1,details.den,details.exteriorConstruction1,details.garage,details.numBathrooms,details.numBedrooms,details.numFireplaces,details.numGarageSpaces,details.numParkingSpaces,details.numRooms,details.numRoomsPlus,details.propertyType,details.sqft,details.style,details.yearBuilt,details.familyRoom,details.numKitchens,lot.depth,lot.width,nearby.amenities,taxes.annualAmount
0,X10421506,688000,New,,,,43.105279,-79.069911,Central Air,"Finished, Separate Entrance",,"Vinyl Siding, Brick",,5,6,1,,,10,2.0,Detached,1500-2000,2-Storey,100+,Y,3,100.0,32.61,[],2500.34
1,N10421482,1738000,New,0.0,0.0,Richmond Hill,43.901048,-79.46148,Central Air,Full,Y,Brick,Attached,3,4,Y,2.0,5.0,8,1.0,Detached,2500-3000,2-Storey,,Y,1,113.32,36.09,[],7210.0
2,W10421557,825000,New,0.0,0.0,Toronto W04,43.706359,-79.529639,Central Air,Finished,N,Brick Front,,2,2,N,0.0,0.0,6,2.0,Detached,,2-Storey,,N,1,100.0,20.0,"[Library, Park, Public Transit, School]",2703.8
3,X10421506,688000,New,0.0,0.0,Niagara Falls,43.10529,-79.070014,Central Air,Finished,Y,Brick,,5,6,Y,0.0,6.0,10,2.0,Detached,1500-2000,2-Storey,100+,Y,3,100.0,32.61,[],2500.34
4,W10421494,799900,New,0.0,0.0,Toronto W03,43.679507,-79.499163,Central Air,Finished,N,Alum Siding,Detached,2,3,N,1.0,3.0,8,4.0,Detached,1100-1500,2-Storey,,N,1,125.0,25.0,"[Golf, Library, Public Transit, Rec Centre, Sc...",3998.47


In [13]:
# model = PCA()
# pca_features = model.fit_transform(df_numerics)
# first_pc = model.components_[0,:]



In [14]:
df.head()

Unnamed: 0,mlsNumber,listPrice,lastStatus,soldPrice,daysOnMarket,address.district,map.latitude,map.longitude,details.airConditioning,details.basement1,details.den,details.exteriorConstruction1,details.garage,details.numBathrooms,details.numBedrooms,details.numFireplaces,details.numGarageSpaces,details.numParkingSpaces,details.numRooms,details.numRoomsPlus,details.propertyType,details.sqft,details.style,details.yearBuilt,details.familyRoom,details.numKitchens,lot.depth,lot.width,nearby.amenities,taxes.annualAmount
0,X10421506,688000,New,,,,43.105279,-79.069911,Central Air,"Finished, Separate Entrance",,"Vinyl Siding, Brick",,5,6,1,,,10,2.0,Detached,1500-2000,2-Storey,100+,Y,3,100.0,32.61,[],2500.34
1,N10421482,1738000,New,0.0,0.0,Richmond Hill,43.901048,-79.46148,Central Air,Full,Y,Brick,Attached,3,4,Y,2.0,5.0,8,1.0,Detached,2500-3000,2-Storey,,Y,1,113.32,36.09,[],7210.0
2,W10421557,825000,New,0.0,0.0,Toronto W04,43.706359,-79.529639,Central Air,Finished,N,Brick Front,,2,2,N,0.0,0.0,6,2.0,Detached,,2-Storey,,N,1,100.0,20.0,"[Library, Park, Public Transit, School]",2703.8
3,X10421506,688000,New,0.0,0.0,Niagara Falls,43.10529,-79.070014,Central Air,Finished,Y,Brick,,5,6,Y,0.0,6.0,10,2.0,Detached,1500-2000,2-Storey,100+,Y,3,100.0,32.61,[],2500.34
4,W10421494,799900,New,0.0,0.0,Toronto W03,43.679507,-79.499163,Central Air,Finished,N,Alum Siding,Detached,2,3,N,1.0,3.0,8,4.0,Detached,1100-1500,2-Storey,,N,1,125.0,25.0,"[Golf, Library, Public Transit, Rec Centre, Sc...",3998.47


## ETL and Modelling Stage
#### Above code removes missing or overly detailed features
#### Things to do
#### 1) Transform nearby.amenities into one hots (done)
#### 2) Extract address.district, details.airConditioning	details.basement1	details.den	details.exteriorConstruction1	details.extras	###details.garage
#### details.propertyType	details.style	details.swimmingPool	details.yearBuilt	details.familyRoom	details.driveway
#### 3) Lat long scaling
#### 4) Map details.sqft to numerics (done)
#### 5) 'lot.depth' * 'lot.width' feature (done)
#### 6) Look at time metrics 
#### 6) StandardScaler on all numerics
#### 7) Keep one hots and test out


In [15]:
one_hot_cols = ['lastStatus','address.district','details.airConditioning','details.basement1','details.den','details.exteriorConstruction1','details.garage','details.propertyType','details.style','details.familyRoom']
numeric_cols = [ 'soldPrice','listPrice','daysOnMarket','details.numBathrooms','details.numBedrooms','details.numFireplaces','details.numGarageSpaces','details.numParkingSpaces','details.numRooms','details.numRoomsPlus','details.numKitchens','taxes.annualAmount','lot_size','details.sqft','details.yearBuilt']	

In [16]:
for cols in one_hot_cols:
    print(cols)
    print(df[cols].nunique())

lastStatus
1
address.district
43
details.airConditioning
4
details.basement1
22
details.den
2
details.exteriorConstruction1
23
details.garage
6
details.propertyType
1
details.style
10
details.familyRoom
2


In [17]:
unique_amens = set([item for sublist in df['nearby.amenities'] for item in sublist])
unique_amens

{'Beach',
 'Clear View',
 'Cul De Sac',
 'Cul de Sac/Dead End',
 'Electric Car Charger',
 'Fenced Yard',
 'Golf',
 'Greenbelt/Conservation',
 'Hospital',
 'Lake Access',
 'Lake/Pond',
 'Level',
 'Library',
 'Marina',
 'Park',
 'Place Of Worship',
 'Public Transit',
 'Ravine',
 'Rec Centre',
 'Rec./Commun.Centre',
 'River/Stream',
 'School',
 'School Bus Route',
 'Skiing',
 'Waterfront',
 'Wooded/Treed'}

In [18]:
amenity_cols = [ 'School','Greenbelt/Conservation', 'Public Transit', 'Park', 'Public Transit','Rec Centre', 'Waterfront',]
df['school'] = [1 if 'School' in x else 0 for x in df['nearby.amenities']]
df['greenbelt'] = [1 if '/Conserv' in x else 0 for x in df['nearby.amenities']]
df['park'] = [1 if 'Park' in x else 0 for x in df['nearby.amenities']]
df['transit'] = [1 if 'Transit' in x else 0 for x in df['nearby.amenities']]
df['rec_center'] = [1 if 'Centre' in x else 0 for x in df['nearby.amenities']]
df['waterfront'] = [1 if 'Waterfront' in x else 0 for x in df['nearby.amenities']]
df.drop(columns = 'nearby.amenities',inplace=True)

In [19]:
df['details.sqft'].value_counts()

details.sqft
1500-2000    12
2000-2500    12
1100-1500    11
2500-3000     5
3000-3500     4
700-1100      4
3500-5000     3
5000 +        1
Name: count, dtype: int64

In [20]:
df['details.yearBuilt'].value_counts()

details.yearBuilt
51-99    12
0-5      10
6-15      9
100+      8
31-50     8
16-30     4
New       1
Name: count, dtype: int64

In [21]:
df['details.sqft'].replace({'1500-2000':1750,
'2000-2500':2250,
'1100-1500':1300,
'700-1100' :850,
'500-700':600,
'< 500':250,
'3500-5000':4250,
'2500-3000':2750,
'3000-3500':3250,
'3500-4000':3750,
'4000-5000':4500,
'5000 +':5000},inplace=True)

df['details.yearBuilt'].replace({'0-5':2.5,
'51-99':75,
'6-15':11,
'31-50':40,
'100+':100,
'16-30':22,
'New':0},inplace=True)
df['details.numFireplaces'].replace({'Y':1,'N':0},inplace=True)


df['lot.depth'] = df['lot.depth'].astype(float)
df['lot.width'] = df['lot.width'].astype(float)
df['lot_size'] = df['lot.depth']* df['lot.width']
df.drop(columns = ['lot.depth','lot.width'],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['details.sqft'].replace({'1500-2000':1750,
  df['details.sqft'].replace({'1500-2000':1750,
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['details.yearBuilt'].replace({'0-5':2.5,
  df['details.yearBuilt'].replace({'0-5':2.5,
The behavior will change in pandas 3.0. This inp

In [22]:
[x for x in df.columns if x not in numeric_cols and x not in one_hot_cols]

['mlsNumber',
 'map.latitude',
 'map.longitude',
 'school',
 'greenbelt',
 'park',
 'transit',
 'rec_center',
 'waterfront']

In [44]:
scaler_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler())
])

one_hot_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('one_hot', OneHotEncoder(sparse_output=False))
])

pipeline = ColumnTransformer([
    ('scalar', scaler_pipeline, numeric_cols),
    ('one_hot', one_hot_pipeline, one_hot_cols)
], remainder='passthrough')

pipeline.fit(df[[x for x in df.columns if x!= 'mlsNumber']])

# Transform the data (this will apply one-hot encoding and leave the numeric columns unchanged)
transformed_data = pipeline.transform(df[[x for x in df.columns if x!= 'mlsNumber']])

In [45]:
transformed_data[0]

array([  0.        ,  -0.3871643 ,   0.        ,   1.29267118,
         2.81464583,   0.51291512,   0.38943391,  -0.19218434,
         0.81875522,  -1.32022608,   5.45668211,  -0.84223191,
        -0.15464707,  -0.27685832,   2.2459695 ,   1.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   1.        ,
         1.        ,   0.        ,   0.        ,   0.  

In [51]:
df.head(10)

Unnamed: 0,mlsNumber,listPrice,lastStatus,soldPrice,daysOnMarket,address.district,map.latitude,map.longitude,details.airConditioning,details.basement1,details.den,details.exteriorConstruction1,details.garage,details.numBathrooms,details.numBedrooms,details.numFireplaces,details.numGarageSpaces,details.numParkingSpaces,details.numRooms,details.numRoomsPlus,details.propertyType,details.sqft,details.style,details.yearBuilt,details.familyRoom,details.numKitchens,taxes.annualAmount,school,greenbelt,park,transit,rec_center,waterfront,lot_size
0,X10421506,688000,New,,,,43.105279,-79.069911,Central Air,"Finished, Separate Entrance",,"Vinyl Siding, Brick",,5,6,1.0,,,10,2.0,Detached,1750.0,2-Storey,100.0,Y,3,2500.34,0,0,0,0,0,0,3261.0
1,N10421482,1738000,New,0.0,0.0,Richmond Hill,43.901048,-79.46148,Central Air,Full,Y,Brick,Attached,3,4,1.0,2.0,5.0,8,1.0,Detached,2750.0,2-Storey,,Y,1,7210.0,0,0,0,0,0,0,4089.7188
2,W10421557,825000,New,0.0,0.0,Toronto W04,43.706359,-79.529639,Central Air,Finished,N,Brick Front,,2,2,0.0,0.0,0.0,6,2.0,Detached,,2-Storey,,N,1,2703.8,1,0,1,0,0,0,2000.0
3,X10421506,688000,New,0.0,0.0,Niagara Falls,43.10529,-79.070014,Central Air,Finished,Y,Brick,,5,6,1.0,0.0,6.0,10,2.0,Detached,1750.0,2-Storey,100.0,Y,3,2500.34,0,0,0,0,0,0,3261.0
4,W10421494,799900,New,0.0,0.0,Toronto W03,43.679507,-79.499163,Central Air,Finished,N,Alum Siding,Detached,2,3,0.0,1.0,3.0,8,4.0,Detached,1300.0,2-Storey,,N,1,3998.47,1,0,0,0,0,0,3125.0
5,X10421567,869990,New,,,,43.07789,-79.931238,Central Air,"Full, Unfinished",,"Brick Front, Vinyl Siding",Built-In,3,4,,1.5,,7,,Detached,1750.0,2-Storey,2.5,N,3,5188.0,0,0,0,0,0,0,3603.3734
6,W10421720,999999,New,,,Orangeville,43.900274,-80.126873,Central Air,Finished,,"Brick, Vinyl Siding",Attached,3,3,,2.0,,8,2.0,Detached,,Backsplit 4,,N,1,6600.0,0,0,0,0,0,0,4800.0
7,W10421807,2688000,New,,,Toronto W04,43.722066,-79.455277,Central Air,"Finished, Walk-Up",,"Stone, Stucco (Plaster)",Built-In,5,4,,2.0,,11,,Detached,4250.0,2-Storey,2.5,Y,1,10050.0,0,0,0,0,0,0,5200.0
8,X10421990,799900,New,0.0,0.0,Welland,43.019035,-79.254671,Central Air,Full,N,Brick,Attached,2,3,1.0,1.5,4.0,7,5.0,Detached,1300.0,Backsplit 3,40.0,N,1,4636.29,1,0,0,0,0,0,6023.9808
9,X10421773,2700,New,0.0,0.0,Brantford,43.11103,-80.30357,Central Air,Full,N,Brick,Built-In,3,3,0.0,1.0,2.0,6,,Detached,1300.0,2-Storey,2.5,N,1,,1,0,1,0,0,0,2478.6


## Cosine Similarity Model
####here just looking at reducing dimensions and then generating lists
#

In [69]:
def get_listings(listing_id, cosine_sim, df, n_listing = 10):
    idx = df[df['mlsNumber'] == listing_id].index[0]
    
    sim = list(enumerate(cosine_sim[idx]))
    sim = sorted(sim, key=lambda x: x[1], reverse=True)
    
    sim_output = sim[1:n_listing-1]
    similar_indices = [i[0] for i in sim_output]
    
    return df['mlsNumber'].iloc[similar_indices], df.iloc[similar_indices]

In [70]:
similarities = cosine_similarity(transformed_data, transformed_data)


In [77]:

recommended_ids, recommended_listings = get_listings('W10421807', similarities, df, 20)
print("Recommended listings for :")
print(recommended_ids)

Recommended listings for :
13    N10422086
78    W10421725
38    N10421524
57    C10421593
87    W10421826
65    W10421764
43    X10421633
84    N10421740
39    X10422355
1     N10421482
15    X10421966
73    X10421836
89    W10422194
76    N10421891
59    N10421532
91    C10422413
64    W10421764
83    W10421723
Name: mlsNumber, dtype: object


In [78]:
df[df.mlsNumber == 'W10421807']

Unnamed: 0,mlsNumber,listPrice,lastStatus,soldPrice,daysOnMarket,address.district,map.latitude,map.longitude,details.airConditioning,details.basement1,details.den,details.exteriorConstruction1,details.garage,details.numBathrooms,details.numBedrooms,details.numFireplaces,details.numGarageSpaces,details.numParkingSpaces,details.numRooms,details.numRoomsPlus,details.propertyType,details.sqft,details.style,details.yearBuilt,details.familyRoom,details.numKitchens,taxes.annualAmount,school,greenbelt,park,transit,rec_center,waterfront,lot_size
7,W10421807,2688000,New,,,Toronto W04,43.722066,-79.455277,Central Air,"Finished, Walk-Up",,"Stone, Stucco (Plaster)",Built-In,5,4,,2.0,,11,,Detached,4250.0,2-Storey,2.5,Y,1,10050.0,0,0,0,0,0,0,5200.0


In [79]:

recommended_listings

Unnamed: 0,mlsNumber,listPrice,lastStatus,soldPrice,daysOnMarket,address.district,map.latitude,map.longitude,details.airConditioning,details.basement1,details.den,details.exteriorConstruction1,details.garage,details.numBathrooms,details.numBedrooms,details.numFireplaces,details.numGarageSpaces,details.numParkingSpaces,details.numRooms,details.numRoomsPlus,details.propertyType,details.sqft,details.style,details.yearBuilt,details.familyRoom,details.numKitchens,taxes.annualAmount,school,greenbelt,park,transit,rec_center,waterfront,lot_size
13,N10422086,2199999,New,0.0,0.0,King,43.903174,-79.661909,Central Air,Full,Y,Stone,Attached,5,5,1.0,2.0,4.0,10,,Detached,4250.0,2-Storey,2.5,Y,1,10284.12,0,0,0,0,0,0,5976.0
78,W10421725,1398000,New,,,,43.711405,-79.470638,Central Air,"Separate Entrance, Unfinished",,Brick,Attached,4,4,1.0,2.0,,12,,Detached,3250.0,2-Storey,,Y,1,6924.0,1,0,0,0,0,0,9606.5
38,N10421524,2888000,New,0.0,0.0,Markham,43.886092,-79.3185,Central Air,Apartment,Y,Brick,Built-In,5,4,1.0,2.0,6.0,18,,Detached,3250.0,2-Storey,2.5,Y,1,10417.76,0,0,0,0,0,0,4876.1524
57,C10421593,3388000,New,,,Toronto C14,43.782536,-79.392375,Central Air,"Finished, Walk-Out",,"Brick, Stone",Built-In,8,5,,2.0,,10,4.0,Detached,4250.0,2-Storey,0.0,Y,1,20571.71,0,0,0,0,0,0,7135.0
87,W10421826,1950000,New,0.0,0.0,Caledon,43.874258,-79.869977,Central Air,Full,Y,Brick,Attached,4,4,1.0,2.0,10.0,9,,Detached,2750.0,2-Storey,40.0,Y,1,6726.0,0,0,0,0,0,0,15181.8381
65,W10421764,1149000,New,,,,43.712789,-79.840579,Central Air,Finished,,Brick,Attached,4,4,,2.0,,9,,Detached,2250.0,2-Storey,11.0,N,1,6696.45,0,0,0,0,0,0,3371.3548
43,X10421633,1079000,New,,,,43.39672,-80.382098,Central Air,Unfinished,,"Aluminum Siding, Brick Front",Built-In,4,4,,2.0,,7,,Detached,2250.0,2-Storey,11.0,Y,1,6198.0,1,0,1,0,0,0,3446.0343
84,N10421740,1390000,New,0.0,0.0,Innisfil,44.29337,-79.552245,Central Air,Part Fin,N,Brick,Attached,5,4,0.0,2.0,4.0,12,,Detached,3250.0,2-Storey,,N,1,6361.0,0,0,0,0,0,1,7850.59
39,X10422355,3299900,New,,,Hamilton,43.237631,-79.959969,Central Air,"Full, Unfinished",,"Stone, Stucco (Plaster)",Attached,6,4,1.0,2.0,,11,,Detached,5000.0,2-Storey,2.5,Y,1,25059.84,1,0,1,0,0,0,11924.752
1,N10421482,1738000,New,0.0,0.0,Richmond Hill,43.901048,-79.46148,Central Air,Full,Y,Brick,Attached,3,4,1.0,2.0,5.0,8,1.0,Detached,2750.0,2-Storey,,Y,1,7210.0,0,0,0,0,0,0,4089.7188


In [76]:
df.head(30)

Unnamed: 0,mlsNumber,listPrice,lastStatus,soldPrice,daysOnMarket,address.district,map.latitude,map.longitude,details.airConditioning,details.basement1,details.den,details.exteriorConstruction1,details.garage,details.numBathrooms,details.numBedrooms,details.numFireplaces,details.numGarageSpaces,details.numParkingSpaces,details.numRooms,details.numRoomsPlus,details.propertyType,details.sqft,details.style,details.yearBuilt,details.familyRoom,details.numKitchens,taxes.annualAmount,school,greenbelt,park,transit,rec_center,waterfront,lot_size
0,X10421506,688000,New,,,,43.105279,-79.069911,Central Air,"Finished, Separate Entrance",,"Vinyl Siding, Brick",,5,6,1.0,,,10,2.0,Detached,1750.0,2-Storey,100.0,Y,3,2500.34,0,0,0,0,0,0,3261.0
1,N10421482,1738000,New,0.0,0.0,Richmond Hill,43.901048,-79.46148,Central Air,Full,Y,Brick,Attached,3,4,1.0,2.0,5.0,8,1.0,Detached,2750.0,2-Storey,,Y,1,7210.0,0,0,0,0,0,0,4089.7188
2,W10421557,825000,New,0.0,0.0,Toronto W04,43.706359,-79.529639,Central Air,Finished,N,Brick Front,,2,2,0.0,0.0,0.0,6,2.0,Detached,,2-Storey,,N,1,2703.8,1,0,1,0,0,0,2000.0
3,X10421506,688000,New,0.0,0.0,Niagara Falls,43.10529,-79.070014,Central Air,Finished,Y,Brick,,5,6,1.0,0.0,6.0,10,2.0,Detached,1750.0,2-Storey,100.0,Y,3,2500.34,0,0,0,0,0,0,3261.0
4,W10421494,799900,New,0.0,0.0,Toronto W03,43.679507,-79.499163,Central Air,Finished,N,Alum Siding,Detached,2,3,0.0,1.0,3.0,8,4.0,Detached,1300.0,2-Storey,,N,1,3998.47,1,0,0,0,0,0,3125.0
5,X10421567,869990,New,,,,43.07789,-79.931238,Central Air,"Full, Unfinished",,"Brick Front, Vinyl Siding",Built-In,3,4,,1.5,,7,,Detached,1750.0,2-Storey,2.5,N,3,5188.0,0,0,0,0,0,0,3603.3734
6,W10421720,999999,New,,,Orangeville,43.900274,-80.126873,Central Air,Finished,,"Brick, Vinyl Siding",Attached,3,3,,2.0,,8,2.0,Detached,,Backsplit 4,,N,1,6600.0,0,0,0,0,0,0,4800.0
7,W10421807,2688000,New,,,Toronto W04,43.722066,-79.455277,Central Air,"Finished, Walk-Up",,"Stone, Stucco (Plaster)",Built-In,5,4,,2.0,,11,,Detached,4250.0,2-Storey,2.5,Y,1,10050.0,0,0,0,0,0,0,5200.0
8,X10421990,799900,New,0.0,0.0,Welland,43.019035,-79.254671,Central Air,Full,N,Brick,Attached,2,3,1.0,1.5,4.0,7,5.0,Detached,1300.0,Backsplit 3,40.0,N,1,4636.29,1,0,0,0,0,0,6023.9808
9,X10421773,2700,New,0.0,0.0,Brantford,43.11103,-80.30357,Central Air,Full,N,Brick,Built-In,3,3,0.0,1.0,2.0,6,,Detached,1300.0,2-Storey,2.5,N,1,,1,0,1,0,0,0,2478.6
