import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
= [
color '#ADCE74',
'#A2738C',
'#82DBD8',
'#D3C09A',
'#EC8F6A',
'#6BBA62',
'#F3D516',
'#FFCB3C',
'#FF677D',
'#EADADA',
'#999B84'
]
import os
= pd.read_csv('RealEstateAU_1000_Samples.csv') df
print(df.shape)
print(df.duplicated().sum())
= pd.DataFrame({
tabela 'Unique':df.nunique(),
'Null':df.isna().sum(),
'NullPercent':df.isna().sum() / len(df),
'Types':df.dtypes.values
}) display(tabela)
(1000, 27)
0
Unique | Null | NullPercent | Types | |
---|---|---|---|---|
index | 1000 | 0 | 0.000 | int64 |
TID | 1000 | 0 | 0.000 | int64 |
breadcrumb | 2 | 0 | 0.000 | object |
category_name | 2 | 0 | 0.000 | object |
property_type | 13 | 0 | 0.000 | object |
building_size | 169 | 720 | 0.720 | object |
land_size | 346 | 467 | 0.467 | object |
preferred_size | 376 | 391 | 0.391 | object |
open_date | 15 | 698 | 0.698 | object |
listing_agency | 85 | 0 | 0.000 | object |
price | 494 | 0 | 0.000 | object |
location_number | 889 | 0 | 0.000 | int64 |
location_type | 1 | 0 | 0.000 | object |
location_name | 494 | 0 | 0.000 | object |
address | 882 | 12 | 0.012 | object |
address_1 | 882 | 12 | 0.012 | object |
city | 56 | 0 | 0.000 | object |
state | 1 | 0 | 0.000 | object |
zip_code | 13 | 0 | 0.000 | int64 |
phone | 84 | 0 | 0.000 | object |
latitude | 0 | 1000 | 1.000 | float64 |
longitude | 0 | 1000 | 1.000 | float64 |
product_depth | 4 | 0 | 0.000 | object |
bedroom_count | 10 | 33 | 0.033 | float64 |
bathroom_count | 5 | 33 | 0.033 | float64 |
parking_count | 11 | 33 | 0.033 | float64 |
RunDate | 1 | 0 | 0.000 | object |
'index','latitude','longitude','state','location_type','location_name','address_1','breadcrumb','RunDate'],axis=1,inplace=True)
df.drop([
= ['category_name','listing_agency','city']
colunas for i in colunas:
= df[i].str.upper()
df[i]
= ['address']
colunas for col in colunas:
= df[col].fillna('Unknow')
df[col]
= ['building_size','preferred_size','land_size']
colunas for col in colunas:
= df[col].str.replace('m²','').str.replace('ha','')
df[col]
print('ok')
ok
= df['listing_agency'].str.split('-',1,expand=True)
colunas 'agency_names'] = colunas[0]
df['agency_names2'] = colunas[1]
df['agency_names2'] = df['agency_names2'].str.replace(' ','')
df['agency_names2'] = df['agency_names2'].replace(' ','Unknown')
df['agency_names2'] = df['agency_names2'].replace( ' ','Unknown')
df[# espaçamento existente na linha
'agency_names'] = df['agency_names'].replace( 'FOR SALE BY OWNER ','FOR SALE BY OWNER') df[
'price'] = df['price'].replace('$1.15m','$1.150000')
df['price'] = df['price'].replace('JUST LIKE THAT: UNDER CONTRACT IN 7 DAYS','JUST LIKE THAT: UNDER CONTRACT IN SEVEN DAYS')
df['price'] = df['price'].replace('Offers over $1.2m','Offers over $1.200000')
df['price'] = df['price'].replace('Auction Wednesday 1st of June 2022','Auction')
df['price'] = df['price'].replace('Auction 8th June on site','Auction')
df['price'] = df['price'].replace('Auction - Wednesday 15th June 2022 at 5.30pm','Auction')
df['price'] = df['price'].replace('AUCTION: Saturday 4th Jun @11am On-Site',"AUCTION")
df['price'] = df['price'].replace('JUST LIKE THAT: UNDER CONTRACT IN 5 DAYS','JUST LIKE THAT: UNDER CONTRACT IN FIVE DAYS')
df['price'] = df['price'].replace('Offers Over $500,000 - Offers by 6.30pm 22/6/22','Offers Over $500,000')
df['price'] = df['price'].replace('Negotiable Above 1.5M','Negotiable Above 1.500000')
df['price'] = df['price'].replace('$147 000','$147,000')
df['price'] = df['price'].replace('$369 000','$369,000')
df['price'] = df['price'].replace('$545,000 ( over 1000sqm of land)','$545,000 ( over thousand sqm of land)')
df['price'] = df['price'].replace('$450 000','$450,000') df[
'property_type'].unique() df[
array(['House', 'Apartment', 'Unit', 'Studio', 'Residential Land',
'Block Of Units', 'Townhouse', 'Acreage', 'Duplex/Semi-detached',
'Other', 'Villa', 'Warehouse', 'Lifestyle'], dtype=object)
= df['price'].str.split('-',1,expand=True)
colunas 'price'] = colunas[0]
df['priceConsidered'] = colunas[1]
df[
= df['price'].str.split(' ',1,expand=True)
colunas 'price'] = colunas[0]
df['priceConsidered'] = colunas[1] df[
'priceCondition'] = df['priceConsidered'].replace(r'[0-9]','',regex=True)
df['priceCondition'] = df['priceCondition'].str.replace('$','',regex=True)
df['priceCondition'] = df['priceCondition'].str.replace('!','',regex=True).str.replace(' ,','',regex=True)
df['priceCondition'] = df['priceCondition'].str.upper()
df[# removendo letras
'priceConsidered'] = df['priceConsidered'].replace('[^0-9]','',regex=True)
df[
# removendo letras
'price'] = df['price'].replace('[^0-9]','',regex=True)
df[# removendo símbolo
'price'] = df['price'].replace('$','',regex=True).replace('-$','',regex=True)
df[
'price'] = df['price'].replace('',np.nan)
df['price'] = df['price'].fillna(df['priceConsidered'])
df['price'] = pd.to_numeric(df['price'], errors='coerce') df[
'land_size'] = pd.to_numeric(df['land_size'], errors='coerce')
df['preferred_size'] = pd.to_numeric(df['preferred_size'], errors='coerce')
df['building_size'] = pd.to_numeric(df['building_size'], errors='coerce') df[
= df[(df['property_type'] != 'Lifestyle') & (df['property_type'] != 'Warehouse')] df
'''
not to exclude the missing data, as it could identify which of the columns had more null values,
but other than that, I used Pandas' pivot_table to identify each attribute by property type,
the index of the property_type column, is linked to the number of rooms existing in them,
each column below represents the number of bathrooms, parking, land size and building
'''
=['property_type','bedroom_count'],
pd.pivot_table(df,index=['bathroom_count','parking_count','building_size','land_size','preferred_size']).style.background_gradient(axis=0) values
bathroom_count | building_size | land_size | parking_count | preferred_size | ||
---|---|---|---|---|---|---|
property_type | bedroom_count | |||||
Acreage | 2.000000 | 2.000000 | nan | 2.000000 | 5.000000 | 2.000000 |
3.000000 | 1.000000 | nan | 7.685000 | 1.000000 | 7.685000 | |
4.000000 | 2.666667 | 260.000000 | 7.140000 | 5.666667 | 7.140000 | |
5.000000 | 2.000000 | nan | 9.710000 | 4.000000 | 9.710000 | |
6.000000 | 2.000000 | nan | 2.000000 | 3.000000 | 2.000000 | |
Apartment | 1.000000 | 1.000000 | 67.466667 | 93.333333 | 0.866667 | 67.466667 |
2.000000 | 1.835294 | 122.455556 | 126.158824 | 1.411765 | 122.455556 | |
3.000000 | 1.986301 | 209.851818 | 191.862500 | 1.931507 | 209.851818 | |
4.000000 | 2.125000 | nan | 252.400000 | 2.000000 | nan | |
5.000000 | 3.000000 | 415.000000 | nan | 2.000000 | 415.000000 | |
Block Of Units | 1.000000 | 1.000000 | nan | nan | 1.000000 | nan |
2.000000 | 1.000000 | nan | nan | 1.000000 | nan | |
3.000000 | 2.000000 | 377.440000 | 807.000000 | 2.000000 | 377.440000 | |
6.000000 | 4.000000 | nan | nan | 4.000000 | nan | |
8.000000 | 4.000000 | nan | 812.000000 | 2.000000 | 812.000000 | |
Duplex/Semi-detached | 2.000000 | 1.000000 | nan | nan | 1.000000 | nan |
3.000000 | 1.846154 | nan | 352.600000 | 2.307692 | 352.600000 | |
4.000000 | 2.000000 | 302.333333 | 346.000000 | 2.000000 | 302.333333 | |
House | 1.000000 | 1.000000 | 81.000000 | 332.000000 | 1.000000 | 332.000000 |
2.000000 | 1.434783 | 135.333333 | 329.383333 | 1.565217 | 294.586364 | |
3.000000 | 1.519337 | 197.369821 | 643.007143 | 2.497238 | 636.977407 | |
4.000000 | 2.062500 | 246.642885 | 657.429683 | 2.750000 | 657.429683 | |
5.000000 | 2.500000 | 274.071429 | 608.618462 | 4.136364 | 608.618462 | |
6.000000 | 3.000000 | 281.500000 | 486.678333 | 4.900000 | 486.678333 | |
7.000000 | 5.000000 | 815.000000 | nan | 10.000000 | nan | |
9.000000 | 4.000000 | nan | 837.000000 | 6.000000 | 837.000000 | |
Other | 0.000000 | 1.000000 | nan | nan | 0.000000 | nan |
2.000000 | 1.000000 | nan | nan | 1.000000 | nan | |
3.000000 | 2.000000 | 167.000000 | nan | 5.000000 | nan | |
Studio | 0.000000 | 1.000000 | 55.000000 | nan | 1.000000 | 55.000000 |
1.000000 | 1.000000 | 52.000000 | nan | 1.000000 | 52.000000 | |
Townhouse | 2.000000 | 1.090909 | 90.000000 | 45.000000 | 1.000000 | 75.000000 |
3.000000 | 2.080000 | 103.000000 | 342.750000 | 2.160000 | 335.937500 | |
4.000000 | 2.500000 | 251.000000 | 357.000000 | 2.500000 | 251.000000 | |
Unit | 0.000000 | 1.000000 | 12.000000 | nan | 1.000000 | 12.000000 |
1.000000 | 1.000000 | 63.823529 | 76.600000 | 0.886792 | 70.000000 | |
2.000000 | 1.451613 | 118.424242 | 164.829787 | 1.548387 | 151.436620 | |
3.000000 | 1.980392 | 208.000000 | 251.000000 | 2.078431 | 218.857143 | |
4.000000 | 2.000000 | 356.000000 | 356.000000 | 1.000000 | 356.000000 | |
Villa | 2.000000 | 1.000000 | 80.000000 | 208.500000 | 1.500000 | 157.500000 |
3.000000 | 2.000000 | nan | 331.000000 | 3.000000 | 331.000000 |
'''
here we have a table with the sum of bedrooms, bathrooms and parking in each property by classification
Colors indicate the maximum number of attributes a property contains in the data.
'''
=['product_depth','property_type'],
pd.pivot_table(df,index=['bedroom_count','bathroom_count','parking_count'],aggfunc='sum').style.background_gradient(axis=0) values
bathroom_count | bedroom_count | parking_count | ||
---|---|---|---|---|
product_depth | property_type | |||
feature | Apartment | 106.000000 | 137.000000 | 83.000000 |
Block Of Units | 6.000000 | 11.000000 | 2.000000 | |
Duplex/Semi-detached | 3.000000 | 6.000000 | 4.000000 | |
House | 105.000000 | 209.000000 | 146.000000 | |
Townhouse | 4.000000 | 7.000000 | 3.000000 | |
Unit | 65.000000 | 85.000000 | 63.000000 | |
midtier | Apartment | 4.000000 | 4.000000 | 4.000000 |
House | 6.000000 | 14.000000 | 6.000000 | |
Residential Land | 0.000000 | 0.000000 | 0.000000 | |
Unit | 1.000000 | 2.000000 | 1.000000 | |
premiere | Acreage | 9.000000 | 17.000000 | 19.000000 |
Apartment | 178.000000 | 234.000000 | 167.000000 | |
Block Of Units | 9.000000 | 15.000000 | 9.000000 | |
Duplex/Semi-detached | 23.000000 | 38.000000 | 28.000000 | |
House | 617.000000 | 1194.000000 | 920.000000 | |
Other | 1.000000 | 2.000000 | 1.000000 | |
Residential Land | 0.000000 | 0.000000 | 0.000000 | |
Studio | 1.000000 | 1.000000 | 1.000000 | |
Townhouse | 59.000000 | 88.000000 | 61.000000 | |
Unit | 209.000000 | 291.000000 | 223.000000 | |
Villa | 6.000000 | 10.000000 | 9.000000 | |
standard | Acreage | 9.000000 | 16.000000 | 17.000000 |
Apartment | 78.000000 | 96.000000 | 64.000000 | |
Block Of Units | 1.000000 | 2.000000 | 1.000000 | |
Duplex/Semi-detached | 8.000000 | 15.000000 | 8.000000 | |
House | 97.000000 | 177.000000 | 152.000000 | |
Other | 5.000000 | 6.000000 | 10.000000 | |
Residential Land | 0.000000 | 0.000000 | 0.000000 | |
Studio | 1.000000 | 0.000000 | 1.000000 | |
Townhouse | 6.000000 | 10.000000 | 6.000000 | |
Unit | 62.000000 | 80.000000 | 60.000000 |
'property_type'].value_counts().plot.pie(autopct='%.2f',radius=3, textprops={'size':14},
df[=(0,0,0,0,0,0,2,3,4,5,6), colors=color)
explode'off')
plt.axis( plt.show()
'parking_count'].value_counts().plot.pie(autopct='%.2f',radius=3,explode=(0,0,0,0,0,0,0,0.1,1,2,3),
df[=color, textprops={'size':16})
colors'off')
plt.axis( plt.show()
=(14,24))
plt.figure(figsize= sns.barplot(y=df['property_type'],x=df['price'], hue=df['product_depth'],ci=None, palette=color)
ax =16)
plt.yticks(fontsizeNone)
plt.ylabel(=14)
plt.legend(fontsizefor i in ax.containers:
=16,fmt='%d')
ax.bar_label(i, fontsize='plain',axis='x')
plt.ticklabel_format(style plt.show()
= df['property_type'].unique()
data =(14,47))
plt.figure(figsizefor i,col in enumerate(data):
= plt.subplot(13,1,i + 1)
ax ='land_size',y='price', data=df[df['property_type']==col], hue=df['product_depth'],ci=None)
sns.lineplot(xf'{col}: Price | Product Depth', fontsize=14, fontweight='bold')
plt.title(=14,loc=2, bbox_to_anchor=(1.05,1))
plt.legend(fontsize='plain')
plt.ticklabel_format(style
plt.tight_layout() plt.show()
= ['land_size','preferred_size']
colunas = df['property_type'].unique()
data
=(14,47))
plt.figure(figsizefor i,col in enumerate(data):
for d in colunas:
= plt.subplot(13,1,i + 1)
ax =d,y='price', data=df[df['property_type']==col], label=f'{d}',ci=None)
sns.lineplot(xf'{col}: Price | Property Type', fontsize=14, fontweight='bold')
plt.title(None)
plt.xlabel(=14)
plt.legend(fontsize='plain')
plt.ticklabel_format(style
plt.tight_layout() plt.show()
= ['land_size','building_size']
colunas = df['property_type'].unique()
data
=(14,47))
plt.figure(figsizefor i,col in enumerate(data):
for d in colunas:
= plt.subplot(13,1,i + 1)
ax =d,y='price', data=df[df['property_type']==col], label=f'{d}',ci=None)
sns.lineplot(xf'{col}: Price | Property Type',fontsize=14, fontweight='bold')
plt.title(None)
plt.xlabel(=14)
plt.legend(fontsize='plain')
plt.ticklabel_format(style
plt.tight_layout() plt.show()
=(14,7))
plt.figure(figsize= sns.barplot(y=df['product_depth'],x=df['price'],ci=None,palette=color)
ax None)
plt.ylabel(=14)
plt.yticks(fontsize='plain',axis='x')
plt.ticklabel_format(stylefor i in ax.containers:
=14) ax.bar_label(i, fontsize
=(14,7))
plt.figure(figsize=df['property_type'],palette=color)
sns.countplot(y=14)
plt.yticks(fontsizeNone)
plt.ylabel( plt.show()
=(14,7))
plt.figure(figsize= sns.barplot(y=df['property_type'],x=df['price'],ci=None,palette=color)
ax =14)
plt.yticks(fontsizeNone)
plt.ylabel(='plain',axis='x')
plt.ticklabel_format(stylefor i in ax.containers:
=16,fmt='%d') ax.bar_label(i, fontsize
=(14,20))
plt.figure(figsize=df['city'],order=df['city'].value_counts().index,palette=color)
sns.countplot(y=14)
plt.yticks(fontsizeNone)
plt.ylabel( plt.show()
'price'] == 38900000.0].T df[df[
140 | |
---|---|
TID | 1351128 |
category_name | REAL ESTATE & PROPERTY FOR SALE IN DARWIN CITY... |
property_type | Apartment |
building_size | NaN |
land_size | NaN |
preferred_size | NaN |
open_date | Under offer |
listing_agency | PRD DARWIN - DARWIN CITY |
price | 38900000.0 |
location_number | 136568730 |
address | 3/24 Harvey Street, Darwin City, NT 0800 |
city | DARWIN CITY |
zip_code | 800 |
phone | 0421073034 |
product_depth | standard |
bedroom_count | 2.0 |
bathroom_count | 2.0 |
parking_count | 2.0 |
agency_names | PRD DARWIN |
agency_names2 | DARWINCITY |
priceConsidered | None |
priceCondition | None |
=(14,20))
plt.figure(figsize=df['agency_names'],order=df['agency_names'].value_counts().index,palette=color)
sns.countplot(y=12)
plt.yticks(fontsizeNone)
plt.ylabel( plt.show()
=(14,20))
plt.figure(figsize=df['agency_names2'], order=df['agency_names2'].value_counts().index,palette=color)
sns.countplot(y=12)
plt.yticks(fontsizeNone)
plt.ylabel( plt.show()
=True) df.dropna(inplace
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
= Nominatim(user_agent = 'Busca')
geolocator
= RateLimiter(geolocator.geocode,min_delay_seconds=1)
geocode
'location'] = df['city'].apply(geocode)
df[
'lat'] = df['location'].apply(lambda loc: str(loc.latitude) if loc else None)
df[
'long'] = df['location'].apply(lambda loc: str(loc.longitude) if loc else None) df[
'lat'] = df['lat'].astype(float)
df['long'] = df['long'].astype(float) df[