library(readr)
library(dplyr)
# load the training data set
flats <- read.delim('train.tsv', header=FALSE)
# add column names
colnames(flats) <- c('price', 'isNew', 'rooms', 'floor', 'location', 'sqrMeters')
# examine the distinct location labels (sorted alphabetically, as factor levels)
levels(factor(flats$location))
# convert the column containing locations into a factor with 5 levels
flats$location <- as.character(flats$location)
location <- select(flats, location)
location2 <- unlist(location)
is.vector(location2)  # confirm that unlisting produced a plain (character) vector
location3 <- factor(location2)
# recode the 44 location labels into 5 categories (one code per alphabetically sorted label)
levels(location3) <-
c(4,2,1,1,2,3,1,1,5,3,1,1,5,2,5,2,4,1,2,4,5,4,3,3,3,3,5,4,2,4,2,1,3,4,5,5,2,4,3,1,3,2,3,3)
location4 <- as.character(location3)  # factor codes to character
location.short <- as.integer(location4)  # character codes to integers
location.short <- factor(location.short, ordered=FALSE, levels=c("1", "2", "3", "4", "5"))
location.short <- as.data.frame(location.short)
Append the column with coded location values (variable location.short) to a new data frame (flats.mlm) containing all the data:
flats.mlm <- bind_cols(flats, location.short)
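As an aside, the recoding above could be expressed more compactly with a named lookup vector; a minimal sketch, assuming the same category assignment as in the vector above (flats.mlm2 is a hypothetical name):
codes <- c(4,2,1,1,2,3,1,1,5,3,1,1,5,2,5,2,4,1,2,4,5,4,3,3,3,3,5,4,2,4,2,1,3,4,5,5,2,4,3,1,3,2,3,3)
lookup <- setNames(codes, levels(factor(flats$location)))  # label -> category code
flats.mlm2 <- mutate(flats, location.short = factor(lookup[location], levels = 1:5))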
#shuffle rows
library(caret)
rows <- sample(nrow(flats.mlm))
flats.mlm <- flats.mlm[rows, ]
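Note that sample() makes the shuffle random; a seed can be set beforehand to make it reproducible (a minimal sketch; the seed value is arbitrary):
set.seed(42)  # arbitrary seed, for a reproducible shuffle
flats.mlm <- flats.mlm[sample(nrow(flats.mlm)), ]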
# view the structure
str(flats.mlm)
## 'data.frame': 1674 obs. of 7 variables:
## $ price : num 352390 299000 339500 312259 333024 ...
## $ isNew : Factor w/ 2 levels "False","True": 1 2 2 1 2 2 2 2 1 2 ...
## $ rooms : int 2 2 3 2 2 2 3 3 4 2 ...
## $ floor : int 12 0 7 6 1 0 3 4 14 4 ...
## $ location : chr "Grunwald" "Piątkowo" "Nowe" "Winogrady" ...
## $ sqrMeters : int 80 20 80 60 45 38 53 68 73 33 ...
## $ location.short: Factor w/ 5 levels "1","2","3","4",..: 1 3 3 2 1 1 2 1 1 1 ...
Filter out outliers and unreliable data points:
# cleaning the data according to the procedure applied in the previous study on predicting the price of a flat using simple linear regression
flats.mlm <- filter(flats.mlm,
                    price < 1500000,
                    rooms < 6,
                    !(sqrMeters <= 20),
                    !(sqrMeters <= 30 & rooms == 3),
                    !(sqrMeters <= 38 & rooms == 4),
                    !(sqrMeters <= 44 & rooms == 5))
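A quick sanity check (sketch) of how many observations the cleaning removed:
nrow(flats) - nrow(flats.mlm)  # number of rows dropped by the filtering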
# write to a csv file
library(readr)
write_csv(flats.mlm, 'flatsCodedLocations.csv')
library(ggplot2)
# find out if there might be an association between price and floor depending on whether the flat is new or old
ggplot(flats.mlm, aes(x=floor, y=price, group=floor)) +
geom_boxplot() + facet_wrap(~isNew) +
scale_x_continuous(breaks = seq(0, 16, by = 1)) +
scale_y_continuous(breaks = seq(0, 1.5e+06, by = 2.5e+05))
# find out if there might be an association between price and status of the flat (new or old/used)
ggplot(flats.mlm, aes(x=isNew, y=price)) + geom_boxplot() +
scale_y_continuous(breaks = seq(0, 1.5e+06, by = 2.5e+05))
# find out if there might be an association between price and location depending on whether the flat is new or old/used
ggplot(flats.mlm, aes(x=location.short, y=price)) +
geom_boxplot() + facet_wrap(~isNew) +
scale_y_continuous(breaks = seq(0, 1.5e+06, by = 2.5e+05))
Generally, new flats (isNew=True) located in the city center (class 1) have somewhat lower average prices than old flats (isNew=False), while the spread of prices in the two groups appears similar. The greatest differences in average price and price spread can be observed between new and old flats in classes 4 and 5, where new flats tend to be more expensive on average than used ones. This suggests that while the isNew variable alone might not help predict flat prices, the interaction between the isNew and location.short variables may improve model performance. A similar conclusion can be drawn for the interaction between the isNew and floor variables (see above). However, it should also be noted that the data contain a lot of noise and the patterns of association are not very systematic.
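These visual impressions can be quantified with a grouped summary (a sketch using dplyr):
# mean, spread and group size of prices per location class and isNew status
flats.mlm %>%
  group_by(location.short, isNew) %>%
  summarise(mean.price = mean(price), sd.price = sd(price), n = n())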
null.model <- lm(price ~ 1, flats.mlm)
summary(null.model)
##
## Call:
## lm(formula = price ~ 1, data = flats.mlm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -262040 -83993 -33040 35960 1004370
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 362040 3698 97.89 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 144200 on 1520 degrees of freedom
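For a null (intercept-only) model, the intercept is simply the sample mean of the response, which can be verified directly:
mean(flats.mlm$price)  # equals the intercept estimate of the null model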
# fit the model
model.A <- lm(price ~ isNew + rooms + floor + sqrMeters + location.short, flats.mlm)
# summary of model fitting results
library(broom)
glance(model.A)
## r.squared adj.r.squared sigma statistic p.value df logLik
## 1 0.3920329 0.3888161 112757.4 121.8721 1.806504e-157 9 -19847.48
## AIC BIC deviance df.residual
## 1 39714.95 39768.22 1.922392e+13 1512
# view coding of the location.short factor
contrasts(flats.mlm$location.short)
## 2 3 4 5
## 1 0 0 0 0
## 2 1 0 0 0
## 3 0 1 0 0
## 4 0 0 1 0
## 5 0 0 0 1
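With this treatment (dummy) coding, category 1 (the city center) is the reference level, and each location.short coefficient measures the difference relative to it. If another reference category were preferred, it could be changed before refitting (a sketch):
# e.g. use category 5 as the reference level instead (hypothetical alternative)
flats.mlm$location.short <- relevel(flats.mlm$location.short, ref = "5")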
# fit the model
model.B <- lm(log(price) ~ sqrMeters + rooms + location.short + location.short:isNew, flats.mlm)
# the interaction term 'floor:isNew' proved insignificant in model fitting trials and was removed from the final model; the log(price) specification was chosen for its higher R-squared
# summary of model fitting results
glance(model.B)
## r.squared adj.r.squared sigma statistic p.value df logLik
## 1 0.455555 0.4515863 0.2453002 114.7845 3.030601e-190 12 -14.76218
## AIC BIC deviance df.residual
## 1 55.52436 124.777 90.79983 1509
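The predictions below use a test set (flats.test), which is assumed to have been prepared in the previous study: read from a separate file, with its price column named exp.price, and with locations coded into the same five categories. A minimal sketch of that preparation, assuming the file is called 'test.tsv':
flats.test <- read.delim('test.tsv', header=FALSE)
colnames(flats.test) <- c('exp.price', 'isNew', 'rooms', 'floor', 'location', 'sqrMeters')
# ...followed by adding location.short with the same 5-level coding as above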
# predict flats' prices with the model.A on the test set
p2 <- predict(model.A, flats.test)
error2 <- p2 - flats.test$exp.price
rmse2 <- round(x = sqrt(mean(error2^2)), digits = 2)
row2 <- c('model.A', rmse2)
# predict flats' prices with the model.B on the test set
p3 <- exp(predict(model.B, flats.test))
error3 <- p3 - flats.test$exp.price
rmse3 <- round(x = sqrt(mean(error3^2)), digits = 2)
row3 <- c('model.B', rmse3)
# generate and summarise predictions using the null model
p1 <- predict(null.model, flats.test) # testing the null model
error1 <- p1 - flats.test$exp.price
rmse1 <- round(x = sqrt(mean(error1^2)), digits = 2)
row1 <- c('null.model', rmse1)
# combine results into a single data frame
results <- as.data.frame(rbind(row1, row2, row3))
colnames(results) <- c('MODEL', 'RMSE')
rownames(results) <- c('1', '2', '3')
results # print prediction results summary
## MODEL RMSE
## 1 null.model 150859.77
## 2 model.A 187768.15
## 3 model.B 180309.48
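Since the same RMSE computation is repeated three times above, it could also be factored into a small helper (a sketch):
rmse <- function(predicted, actual) round(sqrt(mean((predicted - actual)^2)), digits = 2)
rmse(p2, flats.test$exp.price)  # equivalent to rmse2 above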
Discussion
The multiple regression models presented here outperform the simple regression models in predicting flat prices. Both model A (without interaction terms) and model B (with interactions and a log-transformed response) achieved a higher R-squared (greater explanatory power) and a lower in-sample RMSE (smaller error). However, when predicting flat prices on the test data, the two models, just like the simple regression models, produced a higher RMSE than the null model. This might be attributed to a lack of association between the response and explanatory variables in the test set. Therefore, the out-of-sample error will be estimated on the full dataset (i.e. the combined train and test sets) using a cross-validation procedure.
It is also worth mentioning that a different (more efficient, data-driven) handling of the location labels could help decrease the overall prediction error.
# rename the test-set columns (including exp.price -> price) so the two datasets contain the same columns
colnames(flats.test) <- c('price', 'isNew', 'rooms', 'floor', 'location', 'sqrMeters', 'location.short')
allFlats <- bind_rows(flats.mlm, flats.test)
# shuffle rows
rows <- sample(nrow(allFlats))
allFlats <- allFlats[rows, ]
# view the combined dataset
str(allFlats)
## 'data.frame': 1649 obs. of 7 variables:
## $ price : num 341311 389000 311515 599000 576500 ...
## $ isNew : Factor w/ 2 levels "False","True": 2 2 1 2 2 2 2 2 2 2 ...
## $ rooms : int 3 3 2 3 2 2 3 2 3 3 ...
## $ floor : int 8 2 4 3 5 8 2 3 2 7 ...
## $ location : chr "Grunwald" "Jeżyce" "Grunwald" "Grunwald" ...
## $ sqrMeters : int 53 50 49 60 50 37 113 49 82 63 ...
## $ location.short: Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 3 1 2 1 2 ...
library(caret)
set.seed(1215)
# Fit and test the model
model.C <- train(log(price) ~ sqrMeters + rooms + location.short + location.short:isNew,
                 allFlats, method = "lm",
                 trControl = trainControl(method = 'cv', number = 10,
                                          savePredictions = 'all', returnResamp = 'all'))
model.C # CV test results
## Linear Regression
##
## 1649 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1485, 1484, 1484, 1484, 1484, 1484, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.261627 0.3800203 0.188196
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
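Because returnResamp = 'all' was requested, the fold-level results can also be inspected directly (a sketch):
head(model.C$resample)  # per-fold RMSE, Rsquared and MAE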
summary(model.C) # model fitting results
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.36768 -0.14830 -0.02688 0.11751 1.22636
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.1451144 0.0330455 367.527 < 2e-16 ***
## sqrMeters 0.0042265 0.0003673 11.508 < 2e-16 ***
## rooms 0.1737261 0.0099782 17.411 < 2e-16 ***
## location.short2 -0.1851866 0.0337048 -5.494 4.54e-08 ***
## location.short3 -0.1650501 0.0355897 -4.638 3.81e-06 ***
## location.short4 -0.2412183 0.0473855 -5.091 3.98e-07 ***
## location.short5 -0.4471123 0.0566821 -7.888 5.57e-15 ***
## `location.short1:isNewTrue` -0.0815379 0.0244157 -3.340 0.000858 ***
## `location.short2:isNewTrue` 0.0179622 0.0297399 0.604 0.545945
## `location.short3:isNewTrue` -0.0249122 0.0328827 -0.758 0.448795
## `location.short4:isNewTrue` 0.1785637 0.0506869 3.523 0.000439 ***
## `location.short5:isNewTrue` 0.3676688 0.0657298 5.594 2.60e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2613 on 1637 degrees of freedom
## Multiple R-squared: 0.3803, Adjusted R-squared: 0.3761
## F-statistic: 91.33 on 11 and 1637 DF, p-value: < 2.2e-16
# create a data frame with the information required to calculate model fitting metrics;
# both the fitted values and the stored outcome are on the log scale, so exponentiate both
fit.log <- model.C$finalModel$fitted.values %>% as.numeric() %>% exp() %>% as.data.frame()
price <- model.C$trainingData$.outcome %>% as.numeric() %>% exp() %>% as.data.frame()
p.log <- bind_cols(fit.log, price)
colnames(p.log) <- c('fitted', 'outcome')
p.log <- p.log %>% mutate(error = outcome - fitted)
# calculate and print RMSE
(RMSE = sqrt(mean(p.log$error^2)))
## [1] 118133.1
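This RMSE is computed from the training-set fitted values. Because savePredictions = 'all' was set, a genuinely out-of-sample RMSE on the price scale can also be obtained from the held-out CV predictions (a sketch):
cv <- model.C$pred  # held-out predictions from the 10 CV folds (log scale)
sqrt(mean((exp(cv$pred) - exp(cv$obs))^2))  # price-scale RMSE from CV hold-outs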