import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline


# Folder holding the assignment data; the string already ends with a separator.
path = 'C:\\Users\\mahit\\OneDrive\\Desktop\\DSPP\\ML & DL\\Assignments\\JNTUH_ML_DL_assignment_2\\JNTUH ML DL assignment 2\\'


# Build the full CSV location, load it, and echo the resulting frame.
csv_location = path + 'RealEstateAU_1000_Samples.csv'
df = pd.read_csv(csv_location)
df


# First five rows.
df.head(n=5)


# Last five rows.
df.tail(n=5)


# Column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 27 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   index            1000 non-null   int64  
 1   TID              1000 non-null   int64  
 2   breadcrumb       1000 non-null   object 
 3   category_name    1000 non-null   object 
 4   property_type    1000 non-null   object 
 5   building_size    280 non-null    object 
 6   land_size        533 non-null    object 
 7   preferred_size   609 non-null    object 
 8   open_date        302 non-null    object 
 9   listing_agency   1000 non-null   object 
 10  price            1000 non-null   object 
 11  location_number  1000 non-null   int64  
 12  location_type    1000 non-null   object 
 13  location_name    1000 non-null   object 
 14  address          988 non-null    object 
 15  address_1        988 non-null    object 
 16  city             1000 non-null   object 
 17  state            1000 non-null   object 
 18  zip_code         1000 non-null   int64  
 19  phone            1000 non-null   object 
 20  latitude         0 non-null      float64
 21  longitude        0 non-null      float64
 22  product_depth    1000 non-null   object 
 23  bedroom_count    967 non-null    float64
 24  bathroom_count   967 non-null    float64
 25  parking_count    967 non-null    float64
 26  RunDate          1000 non-null   object 
dtypes: float64(5), int64(4), object(18)
memory usage: 211.1+ KB


# Number of missing values per column in the raw frame.
df.isna().sum()

index                 0
TID                   0
breadcrumb            0
category_name         0
property_type         0
building_size       720
land_size           467
preferred_size      391
open_date           698
listing_agency        0
price                 0
location_number       0
location_type         0
location_name         0
address              12
address_1            12
city                  0
state                 0
zip_code              0
phone                 0
latitude           1000
longitude          1000
product_depth         0
bedroom_count        33
bathroom_count       33
parking_count        33
RunDate               0
dtype: int64


# Remove the columns that are not carried forward, in a single call.
unused_columns = ['latitude', 'longitude', 'index', 'address', 'address_1']
df = df.drop(unused_columns, axis=1)


# Data Encoding
# Replace each distinct building_size value with an integer label.
label_encoder = preprocessing.LabelEncoder()
building_size_codes = label_encoder.fit_transform(df['building_size'])
df['building_size'] = building_size_codes
df['building_size'].unique()

array([169,  41,  69,  80,  45,  92,  49, 150, 146,  21,  32,  10, 128,
       139, 138,  23, 155,  91,  17, 163, 141,  65, 127, 106, 149, 157,
         8,  29,  30, 153,  68,  42, 140,  58,  89,  78,  51,  54,   9,
       168, 136, 162,  33,  43, 143, 142, 145, 151, 167, 152,  60,  16,
       107, 101,  38,  77, 135,  44, 117,  95,  20, 120,  88, 110,  94,
        48,  55, 123, 124, 147, 148,  96, 125, 131,  84,  52,  37,  46,
       115,  67,  59,  86,  76,  57,  34,  24, 113,  63, 119,  99,  12,
        13,  15,  27,  75,   7, 116,  22, 156,  90,  25,  35, 165, 109,
        56, 105, 121,   1, 130,  40, 164,   6,  39,  19,  26,  66,  47,
       166,  87,  14, 137, 134,  11,  93,   4,  62,  61, 144,  71, 114,
         3, 111, 100,  72, 103, 104,  53, 133, 129,  73,  97,  82,  81,
        31,  36,  98, 126, 108, 122, 154,  50,  64, 160,   2, 112, 159,
       102,   5,  85,  28,  83,  79,   0, 132,  18, 118, 161, 158,  70,
        74])


# Replace each distinct preferred_size value with an integer label.
label_encoder1 = preprocessing.LabelEncoder()
preferred_size_codes = label_encoder1.fit_transform(df['preferred_size'])
df['preferred_size'] = preferred_size_codes
df['preferred_size'].unique()

array([376,  57,  90,  99,  22,  60, 104,  63, 243, 216,  42,  48,  26,
       200, 141, 194, 186,  35,  43, 303, 103,  38, 358, 263, 197,  75,
       140, 111,  47, 236, 320,  24,  45,  46, 106, 293, 193,  89,  58,
        33,  93,  70, 201, 102,  96,  41,  64,  67,  25, 375, 165, 357,
        49,  59, 203, 206, 270, 373, 276,  71,  36, 114, 108,  54,  95,
       359, 342,   1, 312, 307,  82, 128, 156, 309, 299, 285, 113,  88,
       176, 279, 175, 192, 344, 305, 353,  98,  20, 284, 374, 142, 187,
       339, 302, 347, 112, 196, 237, 365, 300, 143, 311, 167, 228,  32,
       232, 330,  92, 134, 214,  61, 268, 116, 319, 291, 205, 122, 124,
       328, 278, 125, 173, 177,  65,  53, 239, 301, 188, 184, 297,  17,
       290, 295, 246, 226, 333, 304, 159,  86, 345, 224, 127, 217, 250,
         6, 321, 313, 121, 153, 288, 371, 107,   8,  81,  31, 110, 242,
       181, 129, 322,  83,  77, 294, 118, 372, 314, 308, 212, 332, 227,
        30, 119, 351, 368, 267, 362, 230, 275, 346, 360,  66, 168, 286,
       211,  15, 331,  23,   9, 145,   4, 219,  28, 280, 272, 317, 282,
       202, 261,  80,  51, 251, 363, 223,   3, 310, 296, 283, 229, 273,
       245, 130, 178, 213, 244,  62, 105, 208, 148, 161, 287, 340,   0,
        14, 146, 199, 318, 151, 101, 182, 361,  21,  55, 231, 162,  37,
       215, 323,  40, 147, 264, 189, 164, 240, 117,  16,  73,  76, 123,
       256,  94, 191, 367, 138, 172, 221,  13, 369, 180,  78,  29, 163,
       334, 356, 306,  44,  87, 336,   5, 298,  79, 262,  27, 269, 325,
       241, 252, 234, 265, 274, 315, 354, 259, 370,  72, 348,  85, 174,
       260, 155, 255, 326, 238, 149, 179, 170,   2, 183, 281, 249,  11,
       100, 337, 254, 222,   7, 109, 218, 137, 277, 258,  19, 233, 355,
        10, 235, 154, 324, 349, 166, 366, 364, 139, 169, 257,  97, 247,
       198, 209, 271,  84, 292, 158, 185, 204, 289,  52, 253,  12, 157,
       195, 115, 225,  91, 171, 335, 210, 150, 190,  74, 136, 152, 343,
       144, 338, 120, 329,  69, 341, 316,  68, 220, 266, 248, 133, 160,
       132,  18, 126,  34, 135,  39,  56, 131, 352, 327,  50, 350, 207])


# Replace each distinct land_size value with an integer label.
label_encoder2 = preprocessing.LabelEncoder()
land_size_codes = label_encoder2.fit_transform(df['land_size'])
df['land_size'] = land_size_codes
df['land_size'].unique()

array([346,  20, 332, 183, 124,  85,  38,  33,  87,  51, 281,  86,  56,
        93,  97, 242, 180,  18,  28,  41,  82,  42,  40, 215,  88, 331,
        26, 177,  91,  31,  75,  44,  89, 184,  36,  37,  76, 249, 335,
        53,  21,  34,  99,  58,  78, 333, 318,   1, 290, 285,  66, 112,
       140, 287, 277, 263,  96,  72, 161, 257, 160, 176, 319, 283, 327,
        80,  19, 262, 345, 125, 171, 315, 280, 322,  95,  60, 179, 216,
       338, 278, 126, 289, 152, 304,  30, 211, 307,  74, 118, 195,  48,
       247,  98, 297, 269, 187, 106, 108, 305, 256, 109, 158, 162, 218,
       279, 172, 169, 275,  16, 268, 273, 225, 206, 310, 282, 143,  70,
       320, 204, 111, 197, 229,   6, 298, 291, 105, 138, 266, 343,  90,
         8,  65,  29,  92, 221, 166, 113, 299,  67,  59, 272, 101, 344,
       292, 286, 193, 309, 207,  94,  84, 103, 326, 341, 246, 336, 209,
       254, 321, 334,  54, 153,  64, 264, 192,  14, 308,   9, 129,   4,
       199,  25, 258, 251, 295, 260, 185, 240,  63, 230, 203,   3, 288,
       274, 261, 208, 252, 224, 114, 163, 194, 223,  49,  47, 189, 132,
       145, 265, 316,   0, 182, 296, 136,  83, 167,  22,  23, 210, 146,
        35, 196, 300, 131, 243, 173, 149, 219, 100,  15,  57, 107, 130,
       235, 147, 175, 340, 122, 157, 201,  45,  13, 165,  61, 102,  27,
       148, 311, 330, 284,  52,  71, 313,   5, 276,  62, 241,  24, 248,
       302, 220, 231, 213, 244, 253, 293, 328, 238, 342, 323,  69, 159,
       239, 139, 234, 303, 217, 134, 164, 155,   2, 168, 259, 228,  11,
        50,  81, 314, 233, 202,   7, 133, 198, 121, 255, 237, 212, 329,
        10, 214, 301, 324, 271, 151,  39, 339, 337, 123, 154, 236,  79,
       226, 222, 181, 190, 250,  68, 270, 150, 142, 170, 186, 267, 232,
        12, 141, 178, 205,  73, 156, 312, 191, 135, 174, 127, 120, 137,
       128,  77, 104, 306,  55, 317, 294, 200, 245, 227, 117, 144, 116,
        17, 110,  32, 119,  46, 115,  43, 325, 188])


# Replace each distinct price string with an integer label.
# NOTE(review): price is a text column, so these codes follow the sorted order
# of the strings rather than any monetary amount - confirm this is intended
# for the modelling target.
label_encoder3 = preprocessing.LabelEncoder()
price_codes = label_encoder3.fit_transform(df['price'])
df['price'] = price_codes
df['price'].unique()

array([107, 322,  59,  37, 108, 221, 220,  80, 114,  44, 429, 446, 213,
       184, 430, 132,  68, 265, 483, 331, 189, 138,  58,  53, 217, 361,
       335,  35, 157, 380,  94, 127, 255, 291, 122, 258, 362,   0, 484,
       112, 210,  47, 404,  95, 395, 191, 337,  96, 358,  54, 169, 326,
       131, 293, 139,  64, 152,  20,  15,  26, 226,  81,  74, 401,  24,
       101, 140, 149,  67, 469, 285,   6, 345, 124, 143,  28,  31,  60,
       393, 286,  86,  41, 434,  40,   1, 254, 470, 423, 151, 267, 363,
       171, 383, 392, 421,  16, 463, 329, 428, 160, 400, 117,  21, 369,
        52,  83, 281, 399, 110, 146, 398, 288, 314, 490, 287,  32,  90,
        82,  12,  19, 145,  65, 370, 264,  43, 270,   5, 218,  45, 273,
       113, 201, 104, 161,  25,  23, 381, 135, 432, 129, 251, 109,  22,
       493, 106, 235, 231, 479, 236, 302,   3, 351, 457, 253, 252, 238,
       376, 266, 384, 305, 365, 100, 294, 162, 118, 440, 456, 164, 208,
       327, 243, 176, 196, 419, 228, 454, 328, 295, 339, 311, 333, 180,
       174, 181,  27,  36, 103, 214, 224, 425, 165, 317, 125, 488,  79,
       102, 478,  39, 211, 366, 262, 485, 168, 263, 453, 275, 462, 150,
       185,  42, 468, 343, 188, 409, 487,  87, 158, 402, 313,  88, 173,
       397, 405, 330,  10,  51, 133, 227, 247,  33, 471, 413, 408, 244,
       240, 148, 486, 259, 195, 472, 390, 455, 245, 303, 289, 437, 451,
       354, 449, 250, 246,  63, 256, 197, 239,  49, 441, 460, 458, 261,
       368, 203, 379, 410, 202, 232, 134, 260, 477, 130, 198, 248,  69,
       367, 207, 234, 249, 415, 179, 257, 448, 475, 348,   8,  50, 144,
       424, 420, 394,  29, 167,  73,  91, 277, 307, 186,  57, 321, 378,
       418, 452, 225, 241, 411, 492,   4, 422, 461,  76,  77,  84, 414,
       206, 183, 436, 175, 360, 309, 111, 373, 280, 347, 403, 438, 385,
       318, 407,  61,  14,   7,  99, 308, 119,  98, 219, 480, 304,  70,
       283, 153, 464, 274,  85, 142, 382, 299, 199,  30, 156, 310, 447,
       233, 459, 141, 268,  92, 450, 159, 116, 282, 300, 342, 105,  89,
       172,  18,  75, 388, 443, 374, 476, 128, 356, 442, 377, 230, 371,
       276, 284, 137,  11, 426, 312,  72, 372, 417, 177, 353, 346, 205,
       481, 482, 389, 359, 412, 163,  62, 306,  66, 120, 155, 435, 349,
       444, 431, 229, 467,  17, 216, 115, 491, 427, 193, 222, 386, 187,
       315,  48, 136, 364, 396, 323, 324, 297, 340, 121, 242, 301, 209,
       465, 316, 271, 332, 215,  38, 147, 355, 223, 212, 416, 344, 325,
       341, 357, 290, 319, 269, 279, 489, 192, 278, 338, 166, 272,  71,
       466,  78,   2,  97, 126,  46, 352,  93, 182, 190, 200, 298, 320,
       439, 154, 387, 473, 292, 375, 406, 336,  13, 204, 445, 433,  56,
       391,  34, 334, 170,   9, 350, 474, 123,  55, 296, 237, 178, 194])


# Hand-written integer code for every open_date text that is mapped.
open__date = {
    'Under offer': 0,
    'Added 1 hour ago': 1,
    'Added 2 hours ago': 2,
    'Added 4 hours ago': 3,
    'Added 5 hours ago': 4,
    'Added 6 hours ago': 5,
    'Added 9 hours ago': 6,
    'Added 23 hours ago': 7,
    'Added yesterday': 8,
    'Added 2 days ago': 9,
    'Added 3 days ago': 10,
    'Added 4 days ago': 11,
    'Added 6 days ago': 12,
    'Added 7 hours ago': 13,
    'Added 22 hours ago': 14,
}
# Swap the listed texts for their codes; anything not listed is left as it is.
open_date_codes = df['open_date'].replace(open__date)
df['open_date'] = open_date_codes


# Data Imputation

# Assign the filled column back rather than calling fillna(inplace=True) on
# df[col]: that form operates on an intermediate object, is deprecated in
# recent pandas, and does not update df when copy-on-write is enabled.

# Room/parking counts: fill gaps with the column median.
for column in ('bedroom_count', 'bathroom_count', 'parking_count'):
    df[column] = df[column].fillna(df[column].median())

# Remaining columns: fill gaps with the column mean.
# NOTE(review): the three *_size columns appear to have been label-encoded
# before this point, which leaves no NaN to fill - confirm whether the
# imputation was meant to run before the encoding.
for column in ('open_date', 'building_size', 'land_size', 'preferred_size'):
    df[column] = df[column].fillna(df[column].mean())


# Missing values per column after imputation.
df.isna().sum()

TID                0
breadcrumb         0
category_name      0
property_type      0
building_size      0
land_size          0
preferred_size     0
open_date          0
listing_agency     0
price              0
location_number    0
location_type      0
location_name      0
city               0
state              0
zip_code           0
phone              0
product_depth      0
bedroom_count      0
bathroom_count     0
parking_count      0
RunDate            0
dtype: int64


# Print column names, non-null counts and dtypes for the current frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TID              1000 non-null   int64  
 1   breadcrumb       1000 non-null   object 
 2   category_name    1000 non-null   object 
 3   property_type    1000 non-null   object 
 4   building_size    1000 non-null   int32  
 5   land_size        1000 non-null   int32  
 6   preferred_size   1000 non-null   int32  
 7   open_date        1000 non-null   float64
 8   listing_agency   1000 non-null   object 
 9   price            1000 non-null   int32  
 10  location_number  1000 non-null   int64  
 11  location_type    1000 non-null   object 
 12  location_name    1000 non-null   object 
 13  city             1000 non-null   object 
 14  state            1000 non-null   object 
 15  zip_code         1000 non-null   int64  
 16  phone            1000 non-null   object 
 17  product_depth    1000 non-null   object 
 18  bedroom_count    1000 non-null   float64
 19  bathroom_count   1000 non-null   float64
 20  parking_count    1000 non-null   float64
 21  RunDate          1000 non-null   object 
dtypes: float64(4), int32(4), int64(3), object(11)
memory usage: 156.4+ KB


# Print the frequency table of every object-dtype column, one after another.
categorical = df.select_dtypes(include = "object").columns
for column_name in categorical:
    frequencies = df[column_name].value_counts()
    print(frequencies)
    print('________________________________________')

Buy>NT>DARWIN         816
Buy>NT>DARWIN CITY    184
Name: breadcrumb, dtype: int64
________________________________________
Real Estate & Property for sale in DARWIN, NT 0801         816
Real Estate & Property for sale in DARWIN CITY, NT 0800    184
Name: category_name, dtype: int64
________________________________________
House                   441
Unit                    230
Apartment               212
Townhouse                38
Residential Land         33
Duplex/Semi-detached     19
Acreage                   9
Block Of Units            6
Other                     4
Villa                     4
Studio                    2
Warehouse                 1
Lifestyle                 1
Name: property_type, dtype: int64
________________________________________
Real Estate Central - DARWIN CITY                  113
Elders Real Estate - Darwin                         62
Elders Real Estate - Palmerston                     53
Raine & Horne - Darwin                              48
First National Real Estate O'Donoghues - Darwin     41
                                                  ... 
Ellis Parker Real Estate - LARRAKEYAH                1
Dunvegan Real Estate - PALMERSTON                    1
Australian Home Partners                             1
buymyplace                                           1
Mercury Real Estate                                  1
Name: listing_agency, Length: 85, dtype: int64
________________________________________
Buy    1000
Name: location_type, dtype: int64
________________________________________
UNDER CONTRACT          100
Openn Negotiation        19
FASTRAK                  15
Under Contract           11
UNDER OFFER               9
                       ... 
PRICE GUIDE $490,000      1
PRICE GUIDE $510,000      1
New to Market             1
OFFERS OVER $690,000      1
$655,000                  1
Name: location_name, Length: 494, dtype: int64
________________________________________
Darwin City         285
Stuart Park          39
Rosebery             37
Bakewell             31
Durack               30
Zuccoli              29
Woodroffe            27
Nightcliff           27
Driver               26
Parap                26
Rapid Creek          25
Bellamack            23
Humpty Doo           20
Johnston             20
Leanyer              19
Gunn                 19
Gray                 19
Karama               16
Moulden              15
Howard Springs       15
Berrimah             15
Bayview              14
Fannie Bay           14
Farrar               12
Coconut Grove        12
Muirhead             12
The Gardens          11
Lyons                10
Millner              10
Woolner               9
Jingili               9
Herbert               9
Tiwi                  9
Larrakeyah            9
Ludmilla              7
Alawa                 7
Anula                 7
Wagaman               7
Malak                 7
Wulagi                6
Virginia              6
Brinkin               6
Wanguri               6
Berry Springs         6
Moil                  5
Lee Point             4
Nakara                4
Marrara               3
Coolalinga            3
Girraween             3
Bees Creek            3
Cullen Bay            3
The Narrows           1
Knuckey Lagoon        1
Rosebery Heights      1
Marlow Lagoon         1
Name: city, dtype: int64
________________________________________
NT    1000
Name: state, dtype: int64
________________________________________
08 8943 3000    146
08 8946 0500     62
08 8931 5000     53
08 8941 8941     48
08 8942 8942     41
               ... 
0476448709        1
08 8941 1970      1
0408 952 595      1
0889481153        1
0449636436        1
Name: phone, Length: 84, dtype: int64
________________________________________
premiere    659
feature     172
standard    162
midtier       7
Name: product_depth, dtype: int64
________________________________________
2022-05-27 15:54:05    1000
Name: RunDate, dtype: int64
________________________________________


# Distribution plot of the encoded price column.
encoded_price = df['price']
sns.displot(encoded_price)

<seaborn.axisgrid.FacetGrid at 0x232ff0a5460>


# Encoded price against bedroom count (green markers).
fig, ax = plt.subplots(figsize=(7, 4))
ax.set_title('Graph')
ax.set_xlabel('bedroom_count')
ax.set_ylabel('price')

ax.scatter(df['bedroom_count'], df['price'], s=30, alpha=1, color='g')

<matplotlib.collections.PathCollection at 0x23280114850>


# Encoded price against bathroom count (yellow markers).
fig, ax = plt.subplots(figsize=(7, 4))
ax.set_title('Graph')
ax.set_xlabel('Bathroom Count')
ax.set_ylabel('price')

ax.scatter(df['bathroom_count'], df['price'], s=30, alpha=1, color='y')

<matplotlib.collections.PathCollection at 0x23280187790>


# Encoded price against bedroom count again, this time with red markers.
fig, ax = plt.subplots(figsize=(7, 4))
ax.set_title('Graph')
ax.set_xlabel('Bedroom Count')
ax.set_ylabel('price')

ax.scatter(df['bedroom_count'], df['price'], s=30, alpha=1, color='r')

<matplotlib.collections.PathCollection at 0x23280150520>


# Encoded price against parking count (cyan markers).
fig, ax = plt.subplots(figsize=(7, 4))
ax.set_title('Graph')
ax.set_xlabel('Parking Count')
ax.set_ylabel('price')

ax.scatter(df['parking_count'], df['price'], s=30, alpha=1, color='c')

<matplotlib.collections.PathCollection at 0x232802542e0>


# Encoded price against the category_name column (black markers).
fig, ax = plt.subplots(figsize=(7, 4))
ax.set_title('Graph')
ax.set_xlabel('Category')
ax.set_ylabel('price')

ax.scatter(df['category_name'], df['price'], s=30, alpha=1, color='k')

<matplotlib.collections.PathCollection at 0x232802b5b20>


# Encoded price against property type on a wide canvas (blue markers).
fig, ax = plt.subplots(figsize=(20, 6))
ax.set_title('Graph')
ax.set_xlabel('Property type')
ax.set_ylabel('price')

ax.scatter(df['property_type'], df['price'], s=30, alpha=1, color='b')

<matplotlib.collections.PathCollection at 0x232812dccd0>


# Encoded price against encoded land size (green markers).
fig, ax = plt.subplots(figsize=(15, 6))
ax.set_title('Graph')
ax.set_xlabel('Land Size')
ax.set_ylabel('price')

ax.scatter(df['land_size'], df['price'], s=30, alpha=1, color='g')

<matplotlib.collections.PathCollection at 0x2328134fe20>


# Encoded price against encoded building size (yellow markers).
fig, ax = plt.subplots(figsize=(15, 6))
ax.set_title('Graph')
ax.set_xlabel('Building Size')
ax.set_ylabel('price')

ax.scatter(df['building_size'], df['price'], s=30, alpha=1, color='y')

<matplotlib.collections.PathCollection at 0x23281589be0>


# Encoded price against encoded preferred size (cyan markers).
fig, ax = plt.subplots(figsize=(15, 6))
ax.set_title('Graph')
ax.set_xlabel('Preferred Size')
ax.set_ylabel('price')

ax.scatter(df['preferred_size'], df['price'], s=30, alpha=1, color='c')

<matplotlib.collections.PathCollection at 0x23281905a00>


# Work on an explicit copy of the object-dtype columns so the writes below
# never depend on how the selection relates to df.
object_data = df.select_dtypes(include=['object']).copy()

# Convert every object column to integer labels. Assigning by column label
# replaces the whole column, so the result gets an integer dtype; writing
# through .iloc[:, i] can set the values in place and keep the object dtype
# on pandas 2.x.
encoder = preprocessing.LabelEncoder()
for column_name in object_data.columns:
    object_data[column_name] = encoder.fit_transform(object_data[column_name])

# Put the encoded object columns first, followed by the already-numeric ones.
num_data = df.select_dtypes(exclude=['object'])
df = pd.concat([object_data, num_data], axis=1)


# Target: the encoded price, kept as a one-column DataFrame. Selecting by name
# replaces the positional slice 16:17, which silently picks a different column
# if the column order ever changes.
y = df[['price']]

# Features: every other column except location_name. In the previewed rows the
# location_name codes are identical to the price codes, i.e. that column
# carries the listing's price text, so leaving it in hands the target to the
# models (target leakage).
x = df.drop(columns=['price', 'location_name'])

x

y


# Data Normalizing
# Scale every feature into [0, 1] and keep the result. Previously the
# transformed array was computed and discarded, so x stayed unscaled.
# NOTE(review): the scaler is fitted on all rows before the train/test split;
# move it after the split (or rely on the pipelines) if that matters.
minmax = preprocessing.MinMaxScaler(feature_range=(0, 1))
x = pd.DataFrame(minmax.fit_transform(x), columns=x.columns, index=x.index)
x.to_numpy()

array([[1.        , 0.        , 0.33333333, ..., 0.22222222, 0.        ,
        0.08333333],
       [1.        , 0.        , 0.08333333, ..., 0.33333333, 0.25      ,
        0.16666667],
       [1.        , 0.        , 0.83333333, ..., 0.22222222, 0.        ,
        0.08333333],
       ...,
       [0.        , 1.        , 0.33333333, ..., 0.44444444, 0.25      ,
        0.16666667],
       [0.        , 1.        , 0.33333333, ..., 0.44444444, 0.25      ,
        0.25      ],
       [0.        , 1.        , 0.83333333, ..., 0.22222222, 0.25      ,
        0.16666667]])


# Shuffled 80/20 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    shuffle=True,
    random_state=100,
)


# Decision tree with default settings, fitted on the training split.
dtc_clf = DecisionTreeClassifier()
dtc_clf.fit(X=x_train, y=y_train)

DecisionTreeClassifier()


# Predictions of the decision tree on the held-out rows.
y_pred = dtc_clf.predict(x_test)


# Score of the decision tree on each split.
dtc_train_score = dtc_clf.score(x_train, y_train)
dtc_test_score = dtc_clf.score(x_test, y_test)
print('Decision Train Score is : ' , dtc_train_score)
print('Decision Test Score is : ' , dtc_test_score)

Decision Train Score is :  1.0
Decision Test Score is :  0.505


# MSEValue holds the mean squared error; what gets printed is its square
# root, so the label now says RMSE instead of "Mean Squared Error".
MSEValue = mean_squared_error(y_test, y_pred, multioutput='raw_values')
RMSEValue = np.sqrt(MSEValue)
print('Root Mean Squared Error Value is : ', RMSEValue)

Mean Squared Error Value is :  [26.46157592]


# Gradient boosting regressor with fixed hyper-parameters and seed.
GBRModel = GradientBoostingRegressor(
    n_estimators=100,
    max_depth=10,
    learning_rate=0.5,
    random_state=44,
)
GBRModel.fit(x_train, y_train)

# Score of the regressor on each split.
gbr_train_score = GBRModel.score(x_train, y_train)
gbr_test_score = GBRModel.score(x_test, y_test)
print('GBRModel Train Score is : ' , gbr_train_score)
print('GBRModel Test Score is : ' , gbr_test_score)

GBRModel Train Score is :  1.0
GBRModel Test Score is :  0.9999411939994095


# Predict with the gradient boosting model, then report the error.
# MSEValue holds the mean squared error; what gets printed is its square
# root, so the label now says RMSE instead of "Mean Squared Error".
y_pred = GBRModel.predict(x_test)
MSEValue = mean_squared_error(y_test, y_pred, multioutput='raw_values')
RMSEValue = np.sqrt(MSEValue)
print('Root Mean Squared Error Value is : ', RMSEValue)

Mean Squared Error Value is :  [1.24858626]


from sklearn.ensemble import RandomForestClassifier

# Depth-limited random forest with a fixed seed; fit() returns the fitted
# estimator itself, which is what rmf_clf refers to.
rmf = RandomForestClassifier(random_state=2, max_depth=10)
rmf_clf = rmf.fit(X=x_train, y=y_train)


# Score of the random forest on each split.
rfc_train_score = rmf_clf.score(x_train, y_train)
rfc_test_score = rmf_clf.score(x_test, y_test)
print('RFC Train Score is : ', rfc_train_score)
print('RFC Test Score is : ', rfc_test_score)

RFC Train Score is :  0.99125
RFC Test Score is :  0.265


# Predict with the random forest, then report the error.
# MSEValue holds the mean squared error; what gets printed is its square
# root, so the label now says RMSE instead of "Mean Squared Error".
y_pred = rmf_clf.predict(x_test)
MSEValue = mean_squared_error(y_test, y_pred, multioutput='raw_values')
RMSEValue = np.sqrt(MSEValue)
print('Root Mean Squared Error Value is : ', RMSEValue)

Mean Squared Error Value is :  [94.72338676]


# Each pipeline is a scaler step ('scl') followed by an estimator step ('clf').

# Min-max scaling + random forest classifier.
pipe_rf = Pipeline(steps=[
    ('scl', MinMaxScaler()),
    ('clf', RandomForestClassifier(random_state=42)),
])


# Standardisation + decision tree classifier.
pipe_dt = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('clf', DecisionTreeClassifier(random_state=42)),
])


# Min-max scaling + gradient boosting regressor.
pipe_gbr = Pipeline(steps=[
    ('scl', MinMaxScaler()),
    ('clf', GradientBoostingRegressor(random_state=42)),
])


# Order matters: positions in this list are the keys used to look up names.
pipelines = [pipe_rf, pipe_dt, pipe_gbr]

pipelines[1]

Pipeline(steps=[('scl', StandardScaler()),
                ('clf', DecisionTreeClassifier(random_state=42))])


# Display name for each pipeline, keyed by its position in `pipelines`.
pipe_dict = {
    0: 'Random Forest Classifier',
    1: 'Decision Tree',
    2: 'Gradient Boosting',
}


# Fit every pipeline on the training split.
for pipeline in pipelines:
    pipeline.fit(x_train, y_train)


# Report each pipeline's score on the held-out split.
for position, fitted_pipeline in enumerate(pipelines):
    held_out_score = fitted_pipeline.score(x_test, y_test)
    print('%s pipeline test accuracy: %.2f' % (pipe_dict[position], held_out_score))

Random Forest Classifier pipeline test accuracy: 0.30
Decision Tree pipeline test accuracy: 0.49
Gradient Boosting pipeline test accuracy: 1.00


# Score every fitted pipeline exactly once on the held-out split (the previous
# loop called score() twice for each new best).
test_scores = [pipe.score(x_test, y_test) for pipe in pipelines]

# Pick the position of the highest score; ties keep the earliest pipeline.
# Taking the maximum directly also covers the case where no score exceeds 0.0,
# which used to leave best_pipe as '' and report index 0 regardless.
best_clf = max(range(len(pipelines)), key=test_scores.__getitem__)
best_acc = test_scores[best_clf]
best_pipe = pipelines[best_clf]

# NOTE(review): the list mixes classifiers and a regressor, whose score()
# methods report different metrics - confirm the comparison is meaningful.
print('Classifier with best accuracy: %s' % pipe_dict[best_clf])

Classifier with best accuracy: Gradient Boosting

	index	TID	breadcrumb	category_name	property_type	building_size	land_size	preferred_size	open_date	listing_agency	...	state	zip_code	phone	latitude	longitude	product_depth	bedroom_count	bathroom_count	parking_count	RunDate
0	0	1350988	Buy>NT>DARWIN CITY	Real Estate & Property for sale in DARWIN CITY...	House	NaN	NaN	NaN	Added 2 hours ago	Professionals - DARWIN CITY	...	NT	800	08 8941 8289	NaN	NaN	premiere	2.0	1.0	1.0	2022-05-27 15:54:05
1	1	1350989	Buy>NT>DARWIN CITY	Real Estate & Property for sale in DARWIN CITY...	Apartment	171m²	NaN	171m²	Added 7 hours ago	Nick Mousellis Real Estate - Eview Group Member	...	NT	800	0411724000	NaN	NaN	premiere	3.0	2.0	2.0	2022-05-27 15:54:05
2	2	1350990	Buy>NT>DARWIN CITY	Real Estate & Property for sale in DARWIN CITY...	Unit	NaN	NaN	NaN	Added 22 hours ago	Habitat Real Estate - THE GARDENS	...	NT	800	08 8981 0080	NaN	NaN	premiere	2.0	1.0	1.0	2022-05-27 15:54:05
3	3	1350991	Buy>NT>DARWIN CITY	Real Estate & Property for sale in DARWIN CITY...	House	NaN	NaN	NaN	Added yesterday	Ray White - NIGHTCLIFF	...	NT	800	08 8982 2403	NaN	NaN	premiere	1.0	1.0	0.0	2022-05-27 15:54:05
4	4	1350992	Buy>NT>DARWIN CITY	Real Estate & Property for sale in DARWIN CITY...	Unit	201m²	NaN	201m²	Added yesterday	Carol Need Real Estate - Fannie Bay	...	NT	800	0418885966	NaN	NaN	premiere	3.0	2.0	2.0	2022-05-27 15:54:05
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
995	995	1351983	Buy>NT>DARWIN	Real Estate & Property for sale in DARWIN, NT ...	House	NaN	9.17ha	9.17ha	Under offer	United Realty NT - Parap	...	NT	834	08 8981 2666	NaN	NaN	feature	4.0	3.0	6.0	2022-05-27 15:54:05
996	996	1351984	Buy>NT>DARWIN	Real Estate & Property for sale in DARWIN, NT ...	House	203m²	600m²	600m²	NaN	Kassiou Constructions - HOWARD SPRINGS	...	NT	836	08 89834326	NaN	NaN	standard	4.0	2.0	2.0	2022-05-27 15:54:05
997	997	1351985	Buy>NT>DARWIN	Real Estate & Property for sale in DARWIN, NT ...	House	209.6m²	800m²	800m²	NaN	Kassiou Constructions - HOWARD SPRINGS	...	NT	836	08 89834326	NaN	NaN	standard	4.0	2.0	2.0	2022-05-27 15:54:05
998	998	1351986	Buy>NT>DARWIN	Real Estate & Property for sale in DARWIN, NT ...	House	180m²	450m²	450m²	NaN	Kassiou Constructions - HOWARD SPRINGS	...	NT	810	08 89834326	NaN	NaN	standard	4.0	2.0	3.0	2022-05-27 15:54:05
999	999	1351987	Buy>NT>DARWIN	Real Estate & Property for sale in DARWIN, NT ...	Unit	120m²	NaN	120m²	NaN	Home Zone NT - DARWIN	...	NT	820	0418 895 345	NaN	NaN	feature	2.0	2.0	2.0	2022-05-27 15:54:05

	index	TID	breadcrumb	category_name	property_type	building_size	land_size	preferred_size	open_date	listing_agency	...	state	zip_code	phone	latitude	longitude	product_depth	bedroom_count	bathroom_count	parking_count	RunDate
995	995	1351983	Buy>NT>DARWIN	Real Estate & Property for sale in DARWIN, NT ...	House	NaN	9.17ha	9.17ha	Under offer	United Realty NT - Parap	...	NT	834	08 8981 2666	NaN	NaN	feature	4.0	3.0	6.0	2022-05-27 15:54:05
996	996	1351984	Buy>NT>DARWIN	Real Estate & Property for sale in DARWIN, NT ...	House	203m²	600m²	600m²	NaN	Kassiou Constructions - HOWARD SPRINGS	...	NT	836	08 89834326	NaN	NaN	standard	4.0	2.0	2.0	2022-05-27 15:54:05
997	997	1351985	Buy>NT>DARWIN	Real Estate & Property for sale in DARWIN, NT ...	House	209.6m²	800m²	800m²	NaN	Kassiou Constructions - HOWARD SPRINGS	...	NT	836	08 89834326	NaN	NaN	standard	4.0	2.0	2.0	2022-05-27 15:54:05
998	998	1351986	Buy>NT>DARWIN	Real Estate & Property for sale in DARWIN, NT ...	House	180m²	450m²	450m²	NaN	Kassiou Constructions - HOWARD SPRINGS	...	NT	810	08 89834326	NaN	NaN	standard	4.0	2.0	3.0	2022-05-27 15:54:05
999	999	1351987	Buy>NT>DARWIN	Real Estate & Property for sale in DARWIN, NT ...	Unit	120m²	NaN	120m²	NaN	Home Zone NT - DARWIN	...	NT	820	0418 895 345	NaN	NaN	feature	2.0	2.0	2.0	2022-05-27 15:54:05

Australian Housing Price Prediction

Data Analysis

Decision Tree Classifier

Gradient Boosting Model

Random Forest Classifier

Pipelining

	breadcrumb	category_name	property_type	listing_agency	location_type	location_name	city	state	phone	product_depth	...	TID	building_size	land_size	preferred_size	open_date	location_number	zip_code	bedroom_count	bathroom_count	parking_count
0	1	0	4	56	0	107	12	0	39	2	...	1350988	169	346	376	2.000000	139468611	800	2.0	1.0	1.0
1	1	0	1	49	0	322	12	0	7	2	...	1350989	41	346	57	13.000000	139463755	800	3.0	2.0	2.0
2	1	0	10	35	0	59	12	0	52	2	...	1350990	169	346	376	14.000000	139462495	800	2.0	1.0	1.0
3	1	0	4	63	0	37	12	0	57	2	...	1350991	169	346	376	8.000000	139451679	800	1.0	1.0	0.0
4	1	0	10	12	0	108	12	0	15	2	...	1350992	69	346	90	8.000000	139433803	800	3.0	2.0	2.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
995	0	1	4	78	0	237	49	0	53	0	...	1351983	169	325	350	0.000000	138346247	834	4.0	3.0	6.0
996	0	1	4	41	0	178	22	0	65	3	...	1351984	70	188	207	2.592715	138333062	836	4.0	2.0	2.0
997	0	1	4	41	0	194	22	0	65	3	...	1351985	74	262	284	2.592715	138333058	836	4.0	2.0	2.0
998	0	1	4	41	0	200	29	0	65	3	...	1351986	48	160	175	2.592715	138333050	810	4.0	2.0	3.0
999	0	1	10	37	0	95	45	0	14	0	...	1351987	12	346	30	2.592715	138330946	820	2.0	2.0	2.0