c681c4b475b54ed08adab7f03d2a414a

import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

color = [
    '#ADCE74',
    '#A2738C',
    '#82DBD8',
    '#D3C09A',
    '#EC8F6A',
    '#6BBA62',
    '#F3D516',
    '#FFCB3C',
    '#FF677D',
    '#EADADA',
    '#999B84'
]

import os

df = pd.read_csv('RealEstateAU_1000_Samples.csv')

print(df.shape)
print(df.duplicated().sum())
tabela = pd.DataFrame({
    'Unique':df.nunique(),
    'Null':df.isna().sum(),
    'NullPercent':df.isna().sum() / len(df),
    'Types':df.dtypes.values
})
display(tabela)

(1000, 27)
0

	Unique	Null	NullPercent	Types
index	1000	0	0.000	int64
TID	1000	0	0.000	int64
breadcrumb	2	0	0.000	object
category_name	2	0	0.000	object
property_type	13	0	0.000	object
building_size	169	720	0.720	object
land_size	346	467	0.467	object
preferred_size	376	391	0.391	object
open_date	15	698	0.698	object
listing_agency	85	0	0.000	object
price	494	0	0.000	object
location_number	889	0	0.000	int64
location_type	1	0	0.000	object
location_name	494	0	0.000	object
address	882	12	0.012	object
address_1	882	12	0.012	object
city	56	0	0.000	object
state	1	0	0.000	object
zip_code	13	0	0.000	int64
phone	84	0	0.000	object
latitude	0	1000	1.000	float64
longitude	0	1000	1.000	float64
product_depth	4	0	0.000	object
bedroom_count	10	33	0.033	float64
bathroom_count	5	33	0.033	float64
parking_count	11	33	0.033	float64
RunDate	1	0	0.000	object

df.drop(['index','latitude','longitude','state','location_type','location_name','address_1','breadcrumb','RunDate'],axis=1,inplace=True)

colunas = ['category_name','listing_agency','city']
for i in colunas:
    df[i] = df[i].str.upper()

colunas = ['address']
for col in colunas:
    df[col] = df[col].fillna('Unknow')

colunas = ['building_size','preferred_size','land_size']
for col in colunas:
    df[col] = df[col].str.replace('m²','').str.replace('ha','')
    
print('ok')

ok

colunas = df['listing_agency'].str.split('-',1,expand=True)
df['agency_names'] = colunas[0]
df['agency_names2'] = colunas[1]
df['agency_names2'] = df['agency_names2'].str.replace(' ','')
df['agency_names2'] = df['agency_names2'].replace('   ','Unknown')
df['agency_names2'] = df['agency_names2'].replace( '    ','Unknown')
# espaçamento existente na linha
df['agency_names'] = df['agency_names'].replace( 'FOR SALE BY OWNER                                                                                   ','FOR SALE BY OWNER')

df['price'] = df['price'].replace('$1.15m','$1.150000')
df['price'] = df['price'].replace('JUST LIKE THAT: UNDER CONTRACT IN 7 DAYS','JUST LIKE THAT: UNDER CONTRACT IN SEVEN DAYS')
df['price'] = df['price'].replace('Offers over $1.2m','Offers over $1.200000')
df['price'] = df['price'].replace('Auction Wednesday 1st of June 2022','Auction')
df['price'] = df['price'].replace('Auction 8th June on site','Auction')
df['price'] = df['price'].replace('Auction - Wednesday 15th June 2022 at 5.30pm','Auction')
df['price'] = df['price'].replace('AUCTION: Saturday 4th Jun @11am On-Site',"AUCTION")
df['price'] = df['price'].replace('JUST LIKE THAT: UNDER CONTRACT IN 5 DAYS','JUST LIKE THAT: UNDER CONTRACT IN FIVE DAYS')
df['price'] = df['price'].replace('Offers Over $500,000 - Offers by 6.30pm 22/6/22','Offers Over $500,000')
df['price'] = df['price'].replace('Negotiable Above 1.5M','Negotiable Above 1.500000')
df['price'] = df['price'].replace('$147 000','$147,000')
df['price'] = df['price'].replace('$369 000','$369,000')
df['price'] = df['price'].replace('$545,000 ( over 1000sqm of land)','$545,000 ( over thousand sqm of land)')
df['price'] = df['price'].replace('$450 000','$450,000')

df['property_type'].unique()

array(['House', 'Apartment', 'Unit', 'Studio', 'Residential Land',
       'Block Of Units', 'Townhouse', 'Acreage', 'Duplex/Semi-detached',
       'Other', 'Villa', 'Warehouse', 'Lifestyle'], dtype=object)

colunas = df['price'].str.split('-',1,expand=True)
df['price'] = colunas[0]
df['priceConsidered'] = colunas[1]

colunas = df['price'].str.split(' ',1,expand=True)
df['price'] = colunas[0]
df['priceConsidered'] = colunas[1]

df['priceCondition'] = df['priceConsidered'].replace(r'[0-9]','',regex=True)
df['priceCondition'] = df['priceCondition'].str.replace('$','',regex=True)
df['priceCondition'] = df['priceCondition'].str.replace('!','',regex=True).str.replace(' ,','',regex=True)
df['priceCondition'] = df['priceCondition'].str.upper()
# removendo letras
df['priceConsidered'] = df['priceConsidered'].replace('[^0-9]','',regex=True)

# removendo letras
df['price'] = df['price'].replace('[^0-9]','',regex=True)
# removendo símbolo
df['price'] = df['price'].replace('$','',regex=True).replace('-$','',regex=True)

df['price'] = df['price'].replace('',np.nan)
df['price'] = df['price'].fillna(df['priceConsidered'])
df['price'] = pd.to_numeric(df['price'], errors='coerce')

df['land_size'] = pd.to_numeric(df['land_size'], errors='coerce')
df['preferred_size'] = pd.to_numeric(df['preferred_size'], errors='coerce')
df['building_size'] = pd.to_numeric(df['building_size'], errors='coerce')

df =  df[(df['property_type'] != 'Lifestyle') & (df['property_type'] != 'Warehouse')]

'''
not to exclude the missing data, as it could identify which of the columns had more null values,
but other than that, I used Pandas' pivot_table to identify each attribute by property type,
the index of the property_type column, is linked to the number of rooms existing in them, 
each column below represents the number of bathrooms, parking, land size and building
'''
pd.pivot_table(df,index=['property_type','bedroom_count'],
            values=['bathroom_count','parking_count','building_size','land_size','preferred_size']).style.background_gradient(axis=0)

		bathroom_count	building_size	land_size	parking_count	preferred_size
property_type	bedroom_count
Acreage	2.000000	2.000000	nan	2.000000	5.000000	2.000000
	3.000000	1.000000	nan	7.685000	1.000000	7.685000
	4.000000	2.666667	260.000000	7.140000	5.666667	7.140000
	5.000000	2.000000	nan	9.710000	4.000000	9.710000
	6.000000	2.000000	nan	2.000000	3.000000	2.000000
Apartment	1.000000	1.000000	67.466667	93.333333	0.866667	67.466667
	2.000000	1.835294	122.455556	126.158824	1.411765	122.455556
	3.000000	1.986301	209.851818	191.862500	1.931507	209.851818
	4.000000	2.125000	nan	252.400000	2.000000	nan
	5.000000	3.000000	415.000000	nan	2.000000	415.000000
Block Of Units	1.000000	1.000000	nan	nan	1.000000	nan
	2.000000	1.000000	nan	nan	1.000000	nan
	3.000000	2.000000	377.440000	807.000000	2.000000	377.440000
	6.000000	4.000000	nan	nan	4.000000	nan
	8.000000	4.000000	nan	812.000000	2.000000	812.000000
Duplex/Semi-detached	2.000000	1.000000	nan	nan	1.000000	nan
	3.000000	1.846154	nan	352.600000	2.307692	352.600000
	4.000000	2.000000	302.333333	346.000000	2.000000	302.333333
House	1.000000	1.000000	81.000000	332.000000	1.000000	332.000000
	2.000000	1.434783	135.333333	329.383333	1.565217	294.586364
	3.000000	1.519337	197.369821	643.007143	2.497238	636.977407
	4.000000	2.062500	246.642885	657.429683	2.750000	657.429683
	5.000000	2.500000	274.071429	608.618462	4.136364	608.618462
	6.000000	3.000000	281.500000	486.678333	4.900000	486.678333
	7.000000	5.000000	815.000000	nan	10.000000	nan
	9.000000	4.000000	nan	837.000000	6.000000	837.000000
Other	0.000000	1.000000	nan	nan	0.000000	nan
	2.000000	1.000000	nan	nan	1.000000	nan
	3.000000	2.000000	167.000000	nan	5.000000	nan
Studio	0.000000	1.000000	55.000000	nan	1.000000	55.000000
Studio	1.000000	1.000000	52.000000	nan	1.000000	52.000000
Townhouse	2.000000	1.090909	90.000000	45.000000	1.000000	75.000000
	3.000000	2.080000	103.000000	342.750000	2.160000	335.937500
	4.000000	2.500000	251.000000	357.000000	2.500000	251.000000
Unit	0.000000	1.000000	12.000000	nan	1.000000	12.000000
	1.000000	1.000000	63.823529	76.600000	0.886792	70.000000
	2.000000	1.451613	118.424242	164.829787	1.548387	151.436620
	3.000000	1.980392	208.000000	251.000000	2.078431	218.857143
	4.000000	2.000000	356.000000	356.000000	1.000000	356.000000
Villa	2.000000	1.000000	80.000000	208.500000	1.500000	157.500000
Villa	3.000000	2.000000	nan	331.000000	3.000000	331.000000

'''
here we have a table with the sum of bedrooms, bathrooms and parking in each property by classification
Colors indicate the maximum number of attributes a property contains in the data.
'''
pd.pivot_table(df,index=['product_depth','property_type'],
            values=['bedroom_count','bathroom_count','parking_count'],aggfunc='sum').style.background_gradient(axis=0)

		bathroom_count	bedroom_count	parking_count
product_depth	property_type
feature	Apartment	106.000000	137.000000	83.000000
	Block Of Units	6.000000	11.000000	2.000000
	Duplex/Semi-detached	3.000000	6.000000	4.000000
	House	105.000000	209.000000	146.000000
	Townhouse	4.000000	7.000000	3.000000
	Unit	65.000000	85.000000	63.000000
midtier	Apartment	4.000000	4.000000	4.000000
	House	6.000000	14.000000	6.000000
	Residential Land	0.000000	0.000000	0.000000
	Unit	1.000000	2.000000	1.000000
premiere	Acreage	9.000000	17.000000	19.000000
	Apartment	178.000000	234.000000	167.000000
	Block Of Units	9.000000	15.000000	9.000000
	Duplex/Semi-detached	23.000000	38.000000	28.000000
	House	617.000000	1194.000000	920.000000
	Other	1.000000	2.000000	1.000000
	Residential Land	0.000000	0.000000	0.000000
	Studio	1.000000	1.000000	1.000000
	Townhouse	59.000000	88.000000	61.000000
	Unit	209.000000	291.000000	223.000000
	Villa	6.000000	10.000000	9.000000
standard	Acreage	9.000000	16.000000	17.000000
	Apartment	78.000000	96.000000	64.000000
	Block Of Units	1.000000	2.000000	1.000000
	Duplex/Semi-detached	8.000000	15.000000	8.000000
	House	97.000000	177.000000	152.000000
	Other	5.000000	6.000000	10.000000
	Residential Land	0.000000	0.000000	0.000000
	Studio	1.000000	0.000000	1.000000
	Townhouse	6.000000	10.000000	6.000000
	Unit	62.000000	80.000000	60.000000

df['property_type'].value_counts().plot.pie(autopct='%.2f',radius=3, textprops={'size':14},
                                            explode=(0,0,0,0,0,0,2,3,4,5,6), colors=color)
plt.axis('off')
plt.show()

df['parking_count'].value_counts().plot.pie(autopct='%.2f',radius=3,explode=(0,0,0,0,0,0,0,0.1,1,2,3),
                                            colors=color, textprops={'size':16})
plt.axis('off')
plt.show()

plt.figure(figsize=(14,24))
ax = sns.barplot(y=df['property_type'],x=df['price'], hue=df['product_depth'],ci=None, palette=color)
plt.yticks(fontsize=16)
plt.ylabel(None)
plt.legend(fontsize=14)
for i in ax.containers:
    ax.bar_label(i, fontsize=16,fmt='%d')
plt.ticklabel_format(style='plain',axis='x')
plt.show()

data = df['property_type'].unique()
plt.figure(figsize=(14,47))
for i,col in enumerate(data):
    ax = plt.subplot(13,1,i + 1)
    sns.lineplot(x='land_size',y='price', data=df[df['property_type']==col], hue=df['product_depth'],ci=None)
    plt.title(f'{col}: Price | Product Depth', fontsize=14, fontweight='bold')
    plt.legend(fontsize=14,loc=2, bbox_to_anchor=(1.05,1))
    plt.ticklabel_format(style='plain')
plt.tight_layout()
plt.show()

colunas = ['land_size','preferred_size']
data = df['property_type'].unique()

plt.figure(figsize=(14,47))
for i,col in enumerate(data):
    for d in colunas:
        ax = plt.subplot(13,1,i + 1)
        sns.lineplot(x=d,y='price', data=df[df['property_type']==col], label=f'{d}',ci=None)
    plt.title(f'{col}: Price | Property Type', fontsize=14, fontweight='bold')
    plt.xlabel(None)
    plt.legend(fontsize=14)
    plt.ticklabel_format(style='plain')
plt.tight_layout()
plt.show()

colunas = ['land_size','building_size']
data = df['property_type'].unique()

plt.figure(figsize=(14,47))
for i,col in enumerate(data):
    for d in colunas:
        ax = plt.subplot(13,1,i + 1)
        sns.lineplot(x=d,y='price', data=df[df['property_type']==col], label=f'{d}',ci=None)
    plt.title(f'{col}: Price | Property Type',fontsize=14, fontweight='bold')
    plt.xlabel(None)
    plt.legend(fontsize=14)
    plt.ticklabel_format(style='plain')
plt.tight_layout()
plt.show()

plt.figure(figsize=(14,7))
ax = sns.barplot(y=df['product_depth'],x=df['price'],ci=None,palette=color)
plt.ylabel(None)
plt.yticks(fontsize=14)
plt.ticklabel_format(style='plain',axis='x')
for i in ax.containers:
    ax.bar_label(i, fontsize=14)

plt.figure(figsize=(14,7))
sns.countplot(y=df['property_type'],palette=color)
plt.yticks(fontsize=14)
plt.ylabel(None)
plt.show()


plt.figure(figsize=(14,7))
ax = sns.barplot(y=df['property_type'],x=df['price'],ci=None,palette=color)
plt.yticks(fontsize=14)
plt.ylabel(None)
plt.ticklabel_format(style='plain',axis='x')
for i in ax.containers:
    ax.bar_label(i, fontsize=16,fmt='%d')

plt.figure(figsize=(14,20))
sns.countplot(y=df['city'],order=df['city'].value_counts().index,palette=color)
plt.yticks(fontsize=14)
plt.ylabel(None)
plt.show()

df[df['price'] == 38900000.0].T

	140
TID	1351128
category_name	REAL ESTATE & PROPERTY FOR SALE IN DARWIN CITY...
property_type	Apartment
building_size	NaN
land_size	NaN
preferred_size	NaN
open_date	Under offer
listing_agency	PRD DARWIN - DARWIN CITY
price	38900000.0
location_number	136568730
address	3/24 Harvey Street, Darwin City, NT 0800
city	DARWIN CITY
zip_code	800
phone	0421073034
product_depth	standard
bedroom_count	2.0
bathroom_count	2.0
parking_count	2.0
agency_names	PRD DARWIN
agency_names2	DARWINCITY
priceConsidered	None
priceCondition	None

plt.figure(figsize=(14,20))
sns.countplot(y=df['agency_names'],order=df['agency_names'].value_counts().index,palette=color)
plt.yticks(fontsize=12)
plt.ylabel(None)
plt.show()

plt.figure(figsize=(14,20))
sns.countplot(y=df['agency_names2'], order=df['agency_names2'].value_counts().index,palette=color)
plt.yticks(fontsize=12)
plt.ylabel(None)
plt.show()

df.dropna(inplace=True)

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

geolocator = Nominatim(user_agent = 'Busca')

geocode = RateLimiter(geolocator.geocode,min_delay_seconds=1)

df['location'] = df['city'].apply(geocode)

df['lat'] = df['location'].apply(lambda loc: str(loc.latitude) if loc else None)

df['long'] = df['location'].apply(lambda loc: str(loc.longitude) if loc else None)

df['lat'] = df['lat'].astype(float)
df['long'] = df['long'].astype(float)