<- c('sf', 'tidyverse', 'tmap', 'httr', 'jsonlite', 'rvest',
packages 'sp', 'ggpubr', 'corrplot', 'broom', 'olsrr', 'spdep',
'GWmodel', 'devtools', 'rgeos', 'lwgeom', 'maptools', 'matrixStats',
'units', 'gtsummary', 'Metrics', 'rsample', 'SpatialML')
for(p in packages){
if(!require(p, character.only = T)){
install.packages(p, repos = "http://cran.us.r-project.org")
}library(p, character.only = T)
}
Take-home Exercise 3: Predicting HDB Public Housing Resale Pricies using Geographically Weighted Methods
1 Overview
1.1 Setting the Scene
Housing is an essential component of household wealth worldwide. Buying a housing has always been a major investment for most people. The price of housing is affected by many factors. Some of them are global in nature such as the general economy of a country or inflation rate. Others can be more specific to the properties themselves. These factors can be further divided to structural and locational factors. Structural factors are variables related to the property themselves such as the size, fitting, and tenure of the property. Locational factors are variables related to the neighbourhood of the properties such as proximity to childcare centre, public transport service and shopping centre.
Conventional, housing resale prices predictive models were built by using Ordinary Least Square (OLS) method. However, this method failed to take into consideration that spatial autocorrelation and spatial heterogeneity exist in geographic data sets such as housing transactions. With the existence of spatial autocorrelation, the OLS estimation of predictive housing resale pricing models could lead to biased, inconsistent, or inefficient results (Anselin 1998). In view of this limitation, Geographical Weighted Models were introduced for calibrating predictive model for housing resale prices.
1.2 The Task
In this take-home exercise, you are tasked to predict HDB resale prices at the sub-market level (i.e. HDB 3-room, HDB 4-room and HDB 5-room) for the month of January and February 2023 in Singapore. The predictive models must be built by using by using conventional OLS method and GWR methods. You are also required to compare the performance of the conventional OLS method versus the geographical weighted methods.
1.3 The Data
For the purpose of this take-home exercise, HDB Resale Flat Prices
provided by Data.gov.sg should be used as the core data set. The study should focus on either three-room, four-room or five-room flat and transaction period should be from 1st January 2021 to 31st December 2022. The test data should be January and February 2023 resale prices.
Aspatial Data Set | Source |
---|---|
HDB Resale Data | Data.gov.sg |
Geospatial Data Set | Source |
---|---|
Master Plan 2019 Sub-Zone Boundary | Prof. Kam |
Hawker Centres | OneMap |
Childcare Centres | OneMap |
Kindergartens | OneMap |
NParks Parks and Nature Reserves | OneMap |
Shopping Malls | Wikipedia; GitHub - ValaryLim |
Supermarket | Data.gov.sg |
Eldercare | Data.gov.sg |
MRT Stations | LTA DataMall |
Bus Stops | LTA DataMall |
1.4 Acknowledgement
Name | Source |
---|---|
Prof. Kam | Hands-On Ex 8; In-Class Ex 9 |
Megan Sim | Take-Home Ex 3 |
Nor Aisyah Binte Ajit | Take-Home Ex 3 |
2 Getting Started
2.1 Import Packages
2.2 Importing Aspatial Data
<- read_csv("data/aspatial/resale-flat-prices-based-on-registration-date-from-jan-2017-onwards.csv") resale
head(resale)
# A tibble: 6 × 11
month town flat_…¹ block stree…² store…³ floor…⁴ flat_…⁵ lease…⁶ remai…⁷
<chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <dbl> <chr>
1 2017-01 ANG MO … 2 ROOM 406 ANG MO… 10 TO … 44 Improv… 1979 61 yea…
2 2017-01 ANG MO … 3 ROOM 108 ANG MO… 01 TO … 67 New Ge… 1978 60 yea…
3 2017-01 ANG MO … 3 ROOM 602 ANG MO… 01 TO … 67 New Ge… 1980 62 yea…
4 2017-01 ANG MO … 3 ROOM 465 ANG MO… 04 TO … 68 New Ge… 1980 62 yea…
5 2017-01 ANG MO … 3 ROOM 601 ANG MO… 01 TO … 67 New Ge… 1980 62 yea…
6 2017-01 ANG MO … 3 ROOM 150 ANG MO… 01 TO … 68 New Ge… 1981 63 yea…
# … with 1 more variable: resale_price <dbl>, and abbreviated variable names
# ¹flat_type, ²street_name, ³storey_range, ⁴floor_area_sqm, ⁵flat_model,
# ⁶lease_commence_date, ⁷remaining_lease
2.2.1 Filter Resale Data
Resale Data filter includes both training data and test data set
<- filter(resale, flat_type == "4 ROOM") %>%
resale_filter filter(month >= "2021-01" & month <= "2023-02")
2.2.1.1 Glimpse Filtered Resale Data
glimpse(resale_filter)
Rows: 25,502
Columns: 11
$ month <chr> "2021-01", "2021-01", "2021-01", "2021-01", "2021-…
$ town <chr> "ANG MO KIO", "ANG MO KIO", "ANG MO KIO", "ANG MO …
$ flat_type <chr> "4 ROOM", "4 ROOM", "4 ROOM", "4 ROOM", "4 ROOM", …
$ block <chr> "547", "414", "509", "467", "571", "134", "204", "…
$ street_name <chr> "ANG MO KIO AVE 10", "ANG MO KIO AVE 10", "ANG MO …
$ storey_range <chr> "04 TO 06", "01 TO 03", "01 TO 03", "07 TO 09", "0…
$ floor_area_sqm <dbl> 92, 92, 91, 92, 92, 98, 92, 92, 92, 92, 92, 109, 9…
$ flat_model <chr> "New Generation", "New Generation", "New Generatio…
$ lease_commence_date <dbl> 1981, 1979, 1980, 1979, 1979, 1978, 1977, 1978, 19…
$ remaining_lease <chr> "59 years", "57 years 09 months", "58 years 06 mon…
$ resale_price <dbl> 370000, 375000, 380000, 385000, 410000, 410000, 41…
2.2.1.2 Check for Correct Time Period
unique(resale_filter$month)
[1] "2021-01" "2021-02" "2021-03" "2021-04" "2021-05" "2021-06" "2021-07"
[8] "2021-08" "2021-09" "2021-10" "2021-11" "2021-12" "2022-01" "2022-02"
[15] "2022-03" "2022-04" "2022-05" "2022-06" "2022-07" "2022-08" "2022-09"
[22] "2022-10" "2022-11" "2022-12" "2023-01" "2023-02"
2.2.1.3 Check for Correct Flat Type
unique(resale_filter$flat_type)
[1] "4 ROOM"
2.2.2 Transform Resale Data
2.2.2.1 Adding New Columns
<- resale_filter %>%
resale_transform mutate(resale_filter, address = paste(block,street_name)) %>%
mutate(resale_filter, remaining_lease_yr = as.integer(str_sub(remaining_lease, 0, 2))) %>%
mutate(resale_filter, remaining_lease_mth = as.integer(str_sub(remaining_lease, 9, 11)))
2.2.2.2 Replace NA Values in “remaining_lease_mth”
$remaining_lease_mth[is.na(resale_transform$remaining_lease_mth)] <- 0 resale_transform
2.2.2.3 Convert “remaining_lease_yr” to Months
$remaining_lease_yr <- resale_transform$remaining_lease_yr * 12
resale_transform<- resale_transform %>%
resale_transform mutate(resale_transform, remaining_lease_mths = rowSums(resale_transform[, c("remaining_lease_yr", "remaining_lease_mth")])) %>%
select(month, town, address, block, street_name, flat_type, storey_range, floor_area_sqm, flat_model,
lease_commence_date, remaining_lease_mths, resale_price)
2.2.3 Getting Unique Addresses
<- sort(unique(resale_transform$address)) address
head(address)
[1] "1 CHAI CHEE RD" "1 PINE CL" "1 ST. GEORGE'S RD"
[4] "1 TECK WHYE AVE" "1 TOH YI DR" "10 CHAI CHEE RD"
2.2.4 Getting LAT & LONG from OneMap.sg API
<- function(add_list){
get_coords
# Create a data frame to store all retrieved coordinates
<- data.frame()
postal_coords
for (i in add_list){
#print(i)
<- GET('https://developers.onemap.sg/commonapi/search?',
r query=list(searchVal=i,
returnGeom='Y',
getAddrDetails='Y'))
<- fromJSON(rawToChar(r$content))
data <- data$found
found <- data$results
res
# Create a new data frame for each address
<- data.frame()
new_row
# If single result, append
if (found == 1){
<- res$POSTAL
postal <- res$LATITUDE
lat <- res$LONGITUDE
lng <- data.frame(address= i, postal = postal, latitude = lat, longitude = lng)
new_row
}
# If multiple results, drop NIL and append top 1
else if (found > 1){
# Remove those with NIL as postal
<- res[res$POSTAL != "NIL", ]
res_sub
# Set as NA first if no Postal
if (nrow(res_sub) == 0) {
<- data.frame(address= i, postal = NA, latitude = NA, longitude = NA)
new_row
}
else{
<- head(res_sub, n = 1)
top1 <- top1$POSTAL
postal <- top1$LATITUDE
lat <- top1$LONGITUDE
lng <- data.frame(address= i, postal = postal, latitude = lat, longitude = lng)
new_row
}
}
else {
<- data.frame(address= i, postal = NA, latitude = NA, longitude = NA)
new_row
}
# Add the row
<- rbind(postal_coords, new_row)
postal_coords
}return(postal_coords)
}
2.2.4.1 Calling Function
<- get_coords(address) latlong
2.2.4.2 Check for Missing Value
is.na(latlong))] latlong[(
character(0)
2.2.5 Combine Resale and LAT & LONG
<- left_join(resale_transform, latlong, by = c('address' = 'address')) resale_latlong
head(resale_latlong)
# A tibble: 6 × 15
month town address block stree…¹ flat_…² store…³ floor…⁴ flat_…⁵ lease…⁶
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <dbl>
1 2021-01 ANG MO … 547 AN… 547 ANG MO… 4 ROOM 04 TO … 92 New Ge… 1981
2 2021-01 ANG MO … 414 AN… 414 ANG MO… 4 ROOM 01 TO … 92 New Ge… 1979
3 2021-01 ANG MO … 509 AN… 509 ANG MO… 4 ROOM 01 TO … 91 New Ge… 1980
4 2021-01 ANG MO … 467 AN… 467 ANG MO… 4 ROOM 07 TO … 92 New Ge… 1979
5 2021-01 ANG MO … 571 AN… 571 ANG MO… 4 ROOM 07 TO … 92 New Ge… 1979
6 2021-01 ANG MO … 134 AN… 134 ANG MO… 4 ROOM 07 TO … 98 New Ge… 1978
# … with 5 more variables: remaining_lease_mths <dbl>, resale_price <dbl>,
# postal <chr>, latitude <chr>, longitude <chr>, and abbreviated variable
# names ¹street_name, ²flat_type, ³storey_range, ⁴floor_area_sqm,
# ⁵flat_model, ⁶lease_commence_date
2.2.6 Check for NA
is.na(resale_latlong))] resale_latlong[(
<unspecified> [0]
2.2.7 Write File to rds
<- write_rds(resale_latlong, "data/model/resale_latlong.rds") resale_latlong.rds
2.2.8 Read resale_rds File
<- read_rds("data/model/resale_latlong.rds") resale_main
head(resale_main)
# A tibble: 6 × 15
month town address block stree…¹ flat_…² store…³ floor…⁴ flat_…⁵ lease…⁶
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <dbl>
1 2021-01 ANG MO … 547 AN… 547 ANG MO… 4 ROOM 04 TO … 92 New Ge… 1981
2 2021-01 ANG MO … 414 AN… 414 ANG MO… 4 ROOM 01 TO … 92 New Ge… 1979
3 2021-01 ANG MO … 509 AN… 509 ANG MO… 4 ROOM 01 TO … 91 New Ge… 1980
4 2021-01 ANG MO … 467 AN… 467 ANG MO… 4 ROOM 07 TO … 92 New Ge… 1979
5 2021-01 ANG MO … 571 AN… 571 ANG MO… 4 ROOM 07 TO … 92 New Ge… 1979
6 2021-01 ANG MO … 134 AN… 134 ANG MO… 4 ROOM 07 TO … 98 New Ge… 1978
# … with 5 more variables: remaining_lease_mths <dbl>, resale_price <dbl>,
# postal <chr>, latitude <chr>, longitude <chr>, and abbreviated variable
# names ¹street_name, ²flat_type, ³storey_range, ⁴floor_area_sqm,
# ⁵flat_model, ⁶lease_commence_date
2.2.9 Transform to sf and Assign CRS
<- st_as_sf(resale_main,
resale_main_sf coords = c("longitude",
"latitude"),
crs=4326) %>%
st_transform(crs = 3414)
st_crs(resale_main_sf)
Coordinate Reference System:
User input: EPSG:3414
wkt:
PROJCRS["SVY21 / Singapore TM",
BASEGEOGCRS["SVY21",
DATUM["SVY21",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]]],
PRIMEM["Greenwich",0,
ANGLEUNIT["degree",0.0174532925199433]],
ID["EPSG",4757]],
CONVERSION["Singapore Transverse Mercator",
METHOD["Transverse Mercator",
ID["EPSG",9807]],
PARAMETER["Latitude of natural origin",1.36666666666667,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8801]],
PARAMETER["Longitude of natural origin",103.833333333333,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8802]],
PARAMETER["Scale factor at natural origin",1,
SCALEUNIT["unity",1],
ID["EPSG",8805]],
PARAMETER["False easting",28001.642,
LENGTHUNIT["metre",1],
ID["EPSG",8806]],
PARAMETER["False northing",38744.572,
LENGTHUNIT["metre",1],
ID["EPSG",8807]]],
CS[Cartesian,2],
AXIS["northing (N)",north,
ORDER[1],
LENGTHUNIT["metre",1]],
AXIS["easting (E)",east,
ORDER[2],
LENGTHUNIT["metre",1]],
USAGE[
SCOPE["Cadastre, engineering survey, topographic mapping."],
AREA["Singapore - onshore and offshore."],
BBOX[1.13,103.59,1.47,104.07]],
ID["EPSG",3414]]
2.2.10 Check for Invalid Geometries
length(which(st_is_valid(resale_main_sf) == FALSE))
[1] 0
2.3 Importing Geospatial Data
Importing Master Plan 2019 Subzone
<- st_read(dsn = "data/geospatial", layer = "MPSZ-2019") mpsz
Reading layer `MPSZ-2019' from data source
`C:\Users\la935\Desktop\IS415 - GAA\IS415 - GAA (New)\Take-Home_Ex\Take-Home_Ex03\data\geospatial'
using driver `ESRI Shapefile'
Simple feature collection with 332 features and 6 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: 103.6057 ymin: 1.158699 xmax: 104.0885 ymax: 1.470775
Geodetic CRS: WGS 84
Check for Invalid Geometry
length(which(st_is_valid(mpsz) == FALSE))
[1] 6
Make Geometry Valid and Check for Invalid Geometry Again
<- st_make_valid(mpsz)
mpsz length(which(st_is_valid(mpsz) == FALSE))
[1] 0
Assign ESPG Code
<- st_transform(mpsz, 3414)
mpsz st_crs(mpsz)
Coordinate Reference System:
User input: EPSG:3414
wkt:
PROJCRS["SVY21 / Singapore TM",
BASEGEOGCRS["SVY21",
DATUM["SVY21",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]]],
PRIMEM["Greenwich",0,
ANGLEUNIT["degree",0.0174532925199433]],
ID["EPSG",4757]],
CONVERSION["Singapore Transverse Mercator",
METHOD["Transverse Mercator",
ID["EPSG",9807]],
PARAMETER["Latitude of natural origin",1.36666666666667,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8801]],
PARAMETER["Longitude of natural origin",103.833333333333,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8802]],
PARAMETER["Scale factor at natural origin",1,
SCALEUNIT["unity",1],
ID["EPSG",8805]],
PARAMETER["False easting",28001.642,
LENGTHUNIT["metre",1],
ID["EPSG",8806]],
PARAMETER["False northing",38744.572,
LENGTHUNIT["metre",1],
ID["EPSG",8807]]],
CS[Cartesian,2],
AXIS["northing (N)",north,
ORDER[1],
LENGTHUNIT["metre",1]],
AXIS["easting (E)",east,
ORDER[2],
LENGTHUNIT["metre",1]],
USAGE[
SCOPE["Cadastre, engineering survey, topographic mapping."],
AREA["Singapore - onshore and offshore."],
BBOX[1.13,103.59,1.47,104.07]],
ID["EPSG",3414]]
2.3.1 Data With LAT & LONG
2.3.1.1 Elderly Care
<- st_read(dsn = "data/geospatial", layer = "ELDERCARE") elder_sf
Reading layer `ELDERCARE' from data source
`C:\Users\la935\Desktop\IS415 - GAA\IS415 - GAA (New)\Take-Home_Ex\Take-Home_Ex03\data\geospatial'
using driver `ESRI Shapefile'
Simple feature collection with 133 features and 18 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 14481.92 ymin: 28218.43 xmax: 41665.14 ymax: 46804.9
Projected CRS: SVY21
# Assign EPSG Code
<- st_transform(elder_sf, 3414) elder_sf
2.3.1.2 Bus Stops
<- st_read(dsn = "data/geospatial", layer = "BusStop") BusStop_sf
Reading layer `BusStop' from data source
`C:\Users\la935\Desktop\IS415 - GAA\IS415 - GAA (New)\Take-Home_Ex\Take-Home_Ex03\data\geospatial'
using driver `ESRI Shapefile'
Simple feature collection with 5159 features and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 3970.122 ymin: 26482.1 xmax: 48284.56 ymax: 52983.82
Projected CRS: SVY21
# Assign EPSG Code
<- st_transform(BusStop_sf, 3414) BusStop_sf
2.3.1.3 MRT Stations
<- read.csv("data/geospatial/mrtsg.csv")
mrt
<- st_as_sf(mrt,
mrt_sf coords = c("Longitude",
"Latitude"),
crs = 4326) %>%
st_transform(crs = 3414)
2.3.1.4 Childcare Center
<- st_read(dsn = "data/geospatial", layer = "CHILDCARE") childcare_sf
Reading layer `CHILDCARE' from data source
`C:\Users\la935\Desktop\IS415 - GAA\IS415 - GAA (New)\Take-Home_Ex\Take-Home_Ex03\data\geospatial'
using driver `ESRI Shapefile'
Simple feature collection with 1545 features and 15 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 11203.01 ymin: 25667.6 xmax: 45404.24 ymax: 49300.88
Projected CRS: WGS_1984_Transverse_Mercator
# Assign EPSG Code
<- st_transform(childcare_sf, 3414) childcare_sf
2.3.1.5 Kindergarten
<- st_read(dsn = "data/geospatial", layer = "KINDERGARTENS") kindergarten_sf
Reading layer `KINDERGARTENS' from data source
`C:\Users\la935\Desktop\IS415 - GAA\IS415 - GAA (New)\Take-Home_Ex\Take-Home_Ex03\data\geospatial'
using driver `ESRI Shapefile'
Simple feature collection with 448 features and 15 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 11909.7 ymin: 25596.33 xmax: 43395.47 ymax: 48562.06
Projected CRS: SVY21
# Assign EPSG Code
<- st_transform(kindergarten_sf, 3414) kindergarten_sf
2.3.1.6 Parks
<- st_read(dsn = "data/geospatial", layer = "NATIONALPARKS") parks_sf
Reading layer `NATIONALPARKS' from data source
`C:\Users\la935\Desktop\IS415 - GAA\IS415 - GAA (New)\Take-Home_Ex\Take-Home_Ex03\data\geospatial'
using driver `ESRI Shapefile'
Simple feature collection with 352 features and 15 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 12373.11 ymin: 21869.93 xmax: 46735.95 ymax: 49231.09
Projected CRS: SVY21
# Assign EPSG Code
<- st_transform(parks_sf, 3414) parks_sf
2.3.1.6 Hawker Centre
<- st_read(dsn = "data/geospatial", layer = "HAWKERCENTRE") hawker_sf
Reading layer `HAWKERCENTRE' from data source
`C:\Users\la935\Desktop\IS415 - GAA\IS415 - GAA (New)\Take-Home_Ex\Take-Home_Ex03\data\geospatial'
using driver `ESRI Shapefile'
Simple feature collection with 125 features and 21 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 12874.19 ymin: 28355.97 xmax: 45241.4 ymax: 47872.53
Projected CRS: SVY21
# Assign EPSG Code
<- st_transform(hawker_sf, 3414) hawker_sf
2.3.1.7 Supermarkets
<- st_read(dsn = "data/geospatial", layer = "SUPERMARKETS") supermarket_sf
Reading layer `SUPERMARKETS' from data source
`C:\Users\la935\Desktop\IS415 - GAA\IS415 - GAA (New)\Take-Home_Ex\Take-Home_Ex03\data\geospatial'
using driver `ESRI Shapefile'
Simple feature collection with 526 features and 8 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 4901.188 ymin: 25529.08 xmax: 46948.22 ymax: 49233.6
Projected CRS: SVY21
# Assign EPSG Code
<- st_transform(supermarket_sf, 3414) supermarket_sf
2.3.2 Data Without LAT & LONG
2.3.2.1 CBD (Downtown Core, Singapore)
# Storing LAT & LONG for CBD as Dataframe
<- c('CBD')
name = c(1.287953)
latitude = c(103.851784)
longitude <- data.frame(name, latitude, longitude) cbd
# Convert to sf and Assign EPSG
<- st_as_sf(cbd,
cbd_sf coords = c("longitude",
"latitude"),
crs = 4326) %>%
st_transform(crs = 3414)
st_crs(cbd_sf)
Coordinate Reference System:
User input: EPSG:3414
wkt:
PROJCRS["SVY21 / Singapore TM",
BASEGEOGCRS["SVY21",
DATUM["SVY21",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]]],
PRIMEM["Greenwich",0,
ANGLEUNIT["degree",0.0174532925199433]],
ID["EPSG",4757]],
CONVERSION["Singapore Transverse Mercator",
METHOD["Transverse Mercator",
ID["EPSG",9807]],
PARAMETER["Latitude of natural origin",1.36666666666667,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8801]],
PARAMETER["Longitude of natural origin",103.833333333333,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8802]],
PARAMETER["Scale factor at natural origin",1,
SCALEUNIT["unity",1],
ID["EPSG",8805]],
PARAMETER["False easting",28001.642,
LENGTHUNIT["metre",1],
ID["EPSG",8806]],
PARAMETER["False northing",38744.572,
LENGTHUNIT["metre",1],
ID["EPSG",8807]]],
CS[Cartesian,2],
AXIS["northing (N)",north,
ORDER[1],
LENGTHUNIT["metre",1]],
AXIS["easting (E)",east,
ORDER[2],
LENGTHUNIT["metre",1]],
USAGE[
SCOPE["Cadastre, engineering survey, topographic mapping."],
AREA["Singapore - onshore and offshore."],
BBOX[1.13,103.59,1.47,104.07]],
ID["EPSG",3414]]
2.3.2.2 Primary Schools
<- read.csv("data/geospatial/general-information-of-schools.csv")
primary
<- primary %>%
primary filter(mainlevel_code == "PRIMARY") %>%
select(school_name, address, postal_code, mainlevel_code)
glimpse(primary)
Rows: 183
Columns: 4
$ school_name <chr> "ADMIRALTY PRIMARY SCHOOL", "AHMAD IBRAHIM PRIMARY SCHO…
$ address <chr> "11 WOODLANDS CIRCLE", "10 YISHUN STREET 11", "100 …
$ postal_code <int> 738907, 768643, 579646, 159016, 544969, 569785, 569920,…
$ mainlevel_code <chr> "PRIMARY", "PRIMARY", "PRIMARY", "PRIMARY", "PRIMARY", …
Converting postal codes to LAT & LONG
# Store Primary School Postal and Retrieve LAT & LONG
<- unique(primary$postal_code)
primary_postal
<- get_coords(primary_postal) primary_latlong
Check for NA
is.na(primary_latlong))] primary_latlong[(
[1] NA NA NA NA NA NA NA NA NA
Error found for postal with 0 at the start, replace the postal code with 0 at the front
$postal_code[primary$postal_code == '88256'] <- '088256'
primary$postal_code[primary$postal_code == '99757'] <- '099757'
primary$postal_code[primary$postal_code == '99840'] <- '099840' primary
Re-run converting postal code to LAT & LONG
<- unique(primary$postal_code)
primary_postal
<- get_coords(primary_postal) primary_latlong
Check for NA
is.na(primary_latlong))] primary_latlong[(
character(0)
Combine LAT & LONG with primary file
<- left_join(primary, primary_latlong, by = c('postal_code' = 'postal')) primary_school
Convert to sf object and transform CRS
<- st_as_sf(primary_school,
primary_school_sf coords = c("longitude",
"latitude"),
crs = 4326) %>%
st_transform(crs = 3414)
2.3.2.3 Good Primary Schools
Got the top 10 schools based on a website
<- primary_school %>%
good_primary_school filter(school_name %in%
c("PEI HWA PRESBYTERIAN PRIMARY SCHOOL",
"GONGSHANG PRIMARY SCHOOL",
"RIVERSIDE PRIMARY SCHOOL",
"RED SWASTIKA SCHOOL",
"PUNGGOL GREEN PRIMARY SCHOOL",
"PRINCESS ELIZABETH PRIMARY SCHOOL",
"WESTWOOD PRIMARY SCHOOL",
"AI TONG SCHOOL",
"FRONTIER PRIMARY SCHOOL",
"OASIS PRIMARY SCHOOL"))
Convert to sf object and transform CRS
<- st_as_sf(good_primary_school,
good_primary_school_sf coords = c("longitude",
"latitude"),
crs = 4326) %>%
st_transform(crs = 3414)
2.3.2.4 Shopping Malls
<- read.csv("data/geospatial/mall_coordinates.csv")
shopping
<- shopping %>%
shopping select(name, latitude, longitude)
glimpse(shopping)
Rows: 199
Columns: 3
$ name <chr> "100 AM", "i12 Katong", "313@SOMERSET", "321 CLEMENTI", "600…
$ latitude <dbl> 1.274588, 1.305087, 1.301385, 1.312025, 1.334042, 1.437131, …
$ longitude <dbl> 103.8435, 103.9051, 103.8377, 103.7650, 103.8510, 103.7953, …
<- st_as_sf(shopping,
shopping_sf coords = c("longitude",
"latitude"),
crs = 4326) %>%
st_transform(crs = 3414)
3 Proximity Calculation
3.1 Calculation Function
<- function(df1, df2, col_name) {
prox_cal <- st_distance(df1, df2)
dist_matrix <- rowMins(dist_matrix) / 1000
df1[,col_name] return(df1)
}
3.2 Calculation of Locational Factors
<- prox_cal(resale_main_sf, cbd_sf, "PROX_CBD")
resale_main_sf <- prox_cal(resale_main_sf, BusStop_sf, "PROX_BUS")
resale_main_sf <- prox_cal(resale_main_sf, childcare_sf, "PROX_CHILDCARE")
resale_main_sf <- prox_cal(resale_main_sf, elder_sf, "PROX_ELDERCARE")
resale_main_sf <- prox_cal(resale_main_sf, hawker_sf, "PROX_HAWKER")
resale_main_sf <- prox_cal(resale_main_sf, good_primary_school_sf, "PROX_GOODPRIMARY")
resale_main_sf <- prox_cal(resale_main_sf, parks_sf, "PROX_PARK")
resale_main_sf <- prox_cal(resale_main_sf, supermarket_sf, "PROX_SUPERMARKET")
resale_main_sf <- prox_cal(resale_main_sf, shopping_sf, "PROX_SHOPPING")
resale_main_sf <- prox_cal(resale_main_sf, mrt_sf, "PROX_MRT") resale_main_sf
3.3 Calculation Function With Radius
<- function(df1, df2, col_name, radius) {
prox_cal_radius <- st_distance(df1, df2) %>%
dist_matrix drop_units() %>%
as.data.frame()
<- rowSums(dist_matrix <= radius)
df1[,col_name] return(df1)
}
3.4 Calculation of Locational Factors With Radius
<- prox_cal_radius(resale_main_sf, kindergarten_sf, "WITHIN_350M_KINDERGARTEN", 350)
resale_main_sf <- prox_cal_radius(resale_main_sf, childcare_sf, "WITHIN_350M_CHILDCARE", 350)
resale_main_sf <- prox_cal_radius(resale_main_sf, BusStop_sf, "WITHIN_350M_BUS", 350)
resale_main_sf <- prox_cal_radius(resale_main_sf, primary_school_sf, "WITHIN_1KM_PRIMARY", 1000) resale_main_sf
3.5 Saving Dataset
<- resale_main_sf %>%
resale_main_sf mutate() %>%
rename("AREA_SQM" = "floor_area_sqm",
"LEASE_MTHS" = "remaining_lease_mths",
"PRICE" = "resale_price",
"STOREY" = "storey_range")
<- write_rds(resale_main_sf, "data/model/resale_main.rds") resale_main.rds
4 Exploratory Data Analysis (EDA)
4.1 Read resale_main_rds file
<- read_rds("data/model/resale_main.rds") resale_main_sf
glimpse(resale_main_sf)
Rows: 25,502
Columns: 28
$ month <chr> "2021-01", "2021-01", "2021-01", "2021-01", "…
$ town <chr> "ANG MO KIO", "ANG MO KIO", "ANG MO KIO", "AN…
$ address <chr> "547 ANG MO KIO AVE 10", "414 ANG MO KIO AVE …
$ block <chr> "547", "414", "509", "467", "571", "134", "20…
$ street_name <chr> "ANG MO KIO AVE 10", "ANG MO KIO AVE 10", "AN…
$ flat_type <chr> "4 ROOM", "4 ROOM", "4 ROOM", "4 ROOM", "4 RO…
$ STOREY <chr> "04 TO 06", "01 TO 03", "01 TO 03", "07 TO 09…
$ AREA_SQM <dbl> 92, 92, 91, 92, 92, 98, 92, 92, 92, 92, 92, 1…
$ flat_model <chr> "New Generation", "New Generation", "New Gene…
$ lease_commence_date <dbl> 1981, 1979, 1980, 1979, 1979, 1978, 1977, 197…
$ LEASE_MTHS <dbl> 708, 693, 702, 695, 689, 681, 661, 682, 692, …
$ PRICE <dbl> 370000, 375000, 380000, 385000, 410000, 41000…
$ postal <chr> "560547", "560414", "560509", "560467", "5605…
$ geometry <POINT [m]> POINT (30770.07 39578.64), POINT (30292…
$ PROX_CBD <dbl> 9.564575, 8.401690, 9.516492, 8.580908, 9.084…
$ PROX_BUS <dbl> 0.16157609, 0.16740841, 0.07424143, 0.0887911…
$ PROX_CHILDCARE <dbl> 2.493662e-01, 6.715056e-02, 1.385583e-01, 1.4…
$ PROX_ELDERCARE <dbl> 1.08567795, 0.15039052, 0.72242472, 0.0981628…
$ PROX_HAWKER <dbl> 0.4442515, 0.2050009, 0.4495734, 0.3190679, 0…
$ PROX_GOODPRIMARY <dbl> 3.182527, 2.354024, 2.414729, 2.699653, 2.648…
$ PROX_PARK <dbl> 0.8291527, 0.7847864, 0.3796058, 0.9242129, 0…
$ PROX_SUPERMARKET <dbl> 0.4184204, 0.1946009, 0.4435109, 0.4269715, 0…
$ PROX_SHOPPING <dbl> 1.1817959, 0.8444986, 0.2966736, 0.9304149, 0…
$ PROX_MRT <dbl> 1.0731215, 0.8245176, 0.4544926, 0.9503956, 0…
$ WITHIN_350M_KINDERGARTEN <dbl> 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, …
$ WITHIN_350M_CHILDCARE <dbl> 2, 3, 3, 3, 3, 2, 6, 3, 3, 3, 3, 3, 5, 2, 3, …
$ WITHIN_350M_BUS <dbl> 4, 7, 10, 4, 8, 2, 8, 7, 6, 7, 7, 7, 8, 8, 11…
$ WITHIN_1KM_PRIMARY <dbl> 1, 3, 2, 3, 2, 2, 3, 2, 3, 3, 1, 2, 3, 2, 2, …
4.2 Statistical Graphics
4.2.1 Histogram - Distribution of 4-Room Resale Prices
ggplot(data = resale_main_sf, aes(x = `PRICE`)) +
geom_histogram(bins = 20, color = "black", fill = "light green") +
labs(title = "Distribution of 4-Room Resale Prices",
x = "Resale Prices",
y = "Frequency")
4.2.2 Boxplots - Distribution of 4-Room Resale Prices
ggplot(data = resale_main_sf, aes(x = '', y = PRICE)) +
geom_boxplot() +
labs(x = '', y = 'Resale Prices')
summary(resale_main_sf$PRICE)
Min. 1st Qu. Median Mean 3rd Qu. Max.
250000 445000 495000 529142 570000 1370000
4.2.3 Multiple Histogram Plots Distribution of Locational Factors
<- ggplot(data = resale_main_sf, aes(x = `AREA_SQM`)) +
AREA_SQM geom_histogram(bins = 20, color = "black", fill = "light green")
<- ggplot(data = resale_main_sf, aes(x = `LEASE_MTHS`)) +
LEASE_MTHS geom_histogram(bins = 20, color = "black", fill = "light green")
<- ggplot(data = resale_main_sf, aes(x = `PROX_CBD`)) +
PROX_CBD geom_histogram(bins = 20, color = "black", fill = "light green")
<- ggplot(data = resale_main_sf, aes(x = `PROX_BUS`)) +
PROX_BUS geom_histogram(bins = 20, color = "black", fill = "light green")
<- ggplot(data = resale_main_sf, aes(x = `PROX_CHILDCARE`)) +
PROX_CHILDCARE geom_histogram(bins = 20, color = "black", fill = "light green")
<- ggplot(data = resale_main_sf, aes(x = `PROX_ELDERCARE`)) +
PROX_ELDERCARE geom_histogram(bins = 20, color = "black", fill = "light green")
<- ggplot(data = resale_main_sf, aes(x = `PROX_HAWKER`)) +
PROX_HAWKER geom_histogram(bins = 20, color = "black", fill = "light green")
<- ggplot(data = resale_main_sf, aes(x = `PROX_GOODPRIMARY`)) +
PROX_GOODPRIMARY geom_histogram(bins = 20, color = "black", fill = "light green")
<- ggplot(data = resale_main_sf, aes(x = `PROX_PARK`)) +
PROX_PARK geom_histogram(bins = 20, color = "black", fill = "light green")
<- ggplot(data = resale_main_sf, aes(x = `PROX_SUPERMARKET`)) +
PROX_SUPERMARKET geom_histogram(bins = 20, color = "black", fill = "light green")
<- ggplot(data = resale_main_sf, aes(x = `PROX_SHOPPING`)) +
PROX_SHOPPING geom_histogram(bins = 20, color = "black", fill = "light green")
<- ggplot(data = resale_main_sf, aes(x = `PROX_MRT`)) +
PROX_MRT geom_histogram(bins = 20, color = "black", fill = "light green")
ggarrange(AREA_SQM, LEASE_MTHS, PROX_CBD, PROX_BUS, PROX_CHILDCARE, PROX_ELDERCARE, PROX_HAWKER, PROX_GOODPRIMARY, PROX_PARK, PROX_SUPERMARKET, PROX_SHOPPING, PROX_MRT, ncol = 3, nrow = 4)
4.2.3 Multiple Histogram Plots Distribution of Locational Factors With Radius
<- ggplot(data = resale_main_sf, aes(x = `WITHIN_350M_KINDERGARTEN`)) +
WITHIN_350M_KINDERGARTEN geom_histogram(bins = 20, color = "black", fill = "light green")
<- ggplot(data = resale_main_sf, aes(x = `WITHIN_350M_CHILDCARE`)) +
WITHIN_350M_CHILDCARE geom_histogram(bins = 20, color = "black", fill = "light green")
<- ggplot(data = resale_main_sf, aes(x = `WITHIN_350M_BUS`)) +
WITHIN_350M_BUS geom_histogram(bins = 20, color = "black", fill = "light green")
<- ggplot(data = resale_main_sf, aes(x = `WITHIN_1KM_PRIMARY`)) +
WITHIN_1KM_PRIMARY geom_histogram(bins = 20, color = "black", fill = "light green")
ggarrange(WITHIN_350M_KINDERGARTEN, WITHIN_350M_CHILDCARE, WITHIN_350M_BUS, WITHIN_1KM_PRIMARY, ncol = 2, nrow = 2)
4.2.4 Point Map
tmap_mode("view")
tmap_options(check.and.fix = TRUE)
tm_shape(resale_main_sf)+
tm_dots(col = "PRICE",
alpha = 0.6,
style = "quantile",
popup.vars=c("block"="block", "street_name"="street_name", "flat_model" = "flat_model", "town" = "town", "PRICE" = "PRICE", "LEASE_MTHS", "LEASE_MTHS")) +
tm_view(set.zoom.limits = c(11, 16))