all.flats <- read.csv('all.flats.csv', stringsAsFactors = TRUE) # load the data (base R read.csv; factors match the str() output below)
str(all.flats)
## 'data.frame': 1649 obs. of 7 variables:
## $ price : num 383928 338000 299000 490000 220000 ...
## $ isNew : Factor w/ 2 levels "False","True": 1 2 2 2 2 2 1 2 1 2 ...
## $ rooms : int 3 3 2 4 2 2 3 3 2 3 ...
## $ floor : int 3 1 0 0 4 0 2 1 5 1 ...
## $ location : Factor w/ 43 levels "Antoninek","Bonin",..: 28 12 11 22 11 25 3 25 41 30 ...
## $ sqrMeters : int 46 37 51 40 43 70 78 76 54 66 ...
## $ location.short: int 2 1 1 4 1 3 1 3 2 2 ...
fmla.abs <- price ~ sqrMeters # Write the formula for the base model
model.abs <- lm(fmla.abs, all.flats) # Fit the base model
summary(model.abs)
##
## Call:
## lm(formula = fmla.abs, data = all.flats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -412694 -65070 -25035 33965 973129
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 164920.3 10068.3 16.38 <2e-16 ***
## sqrMeters 3216.3 155.5 20.69 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 129000 on 1647 degrees of freedom
## Multiple R-squared: 0.2063, Adjusted R-squared: 0.2058
## F-statistic: 428 on 1 and 1647 DF, p-value: < 2.2e-16
# alternatively use 'glance' to view the results in a data frame format
library(broom)
glance(model.abs)
## r.squared adj.r.squared sigma statistic p.value df logLik
## 1 0.2062576 0.2057757 129005.4 427.9805 1.050628e-84 2 -21743.62
## AIC BIC deviance df.residual
## 1 43493.23 43509.46 2.741002e+13 1647
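As a quick check, the in-sample RMSE reported in the comparison below can be reproduced directly from the residuals (a sketch; output omitted):
sqrt(mean(residuals(model.abs)^2)) # in-sample RMSE of the base model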
fmla.log <- log(price) ~ sqrMeters # Write the formula for the log-price model
model.log <- lm(fmla.log, all.flats) # Fit the model with log-transformed 'price'
glance(model.log) # or summary(model.log)
## r.squared adj.r.squared sigma statistic p.value df logLik
## 1 0.2119147 0.2114362 0.2937949 442.8752 2.867697e-87 2 -319.0126
## AIC BIC deviance df.residual
## 1 644.0252 660.249 142.1615 1647
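Note that sigma and the errors of this model live on the log scale, so they are not directly comparable with the base model; to compare prices, the fitted values have to be exponentiated back, e.g.:
head(exp(fitted(model.log))) # fitted log-prices back-transformed to prices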
fmla.sqr <- price ~ I(sqrMeters^2) # Write the formula for the squared-size model
model.sqr <- lm(fmla.sqr, all.flats) # Fit the model with squared 'sqrMeters'
glance(model.sqr) # or summary(model.sqr)
## r.squared adj.r.squared sigma statistic p.value df logLik
## 1 0.2136397 0.2131623 128404.1 447.4598 4.699891e-88 2 -21735.91
## AIC BIC deviance df.residual
## 1 43477.83 43494.05 2.71551e+13 1647
library(dplyr)
library(tidyr)
library(ggplot2)
all.flats <- all.flats %>% # add columns with prediction results to all.flats data frame
mutate(pred.abs = predict(model.abs), error.abs = price - pred.abs,
rel.error.abs = ((pred.abs/price) - 1),
pred.log = exp(predict(model.log)), error.log = price - pred.log,
rel.error.log = ((pred.log/price) - 1),
pred.sqr = predict(model.sqr), error.sqr = price - pred.sqr,
rel.error.sqr = ((pred.sqr/price) - 1))
# compare performance of the three models based on their predictions on the full dataset
# create data frame containing prediction results
error <- all.flats %>% select(error.abs, error.log, error.sqr) %>% # absolute errors
gather(key=ModelType, value = error, error.abs, error.log, error.sqr)
error$ModelType <- as.factor(error$ModelType)
levels(error$ModelType) <- c('abs.model', 'log.model', 'sqr.model') # positional renaming, in alphabetical level order
rel.error <- all.flats %>% # relative errors
select(rel.error.abs, rel.error.log, rel.error.sqr) %>%
gather(key=ModelType, value = rel.error, rel.error.abs, rel.error.log, rel.error.sqr)
rel.error$ModelType <- as.factor(rel.error$ModelType)
levels(rel.error$ModelType) <- c('abs.model', 'log.model', 'sqr.model') # same positional renaming
results <- bind_cols(error, rel.error %>% select(rel.error)) # drop the duplicated 'ModelType' column before binding
results %>% select(ModelType, error, rel.error) %>% group_by(ModelType) %>%
summarise(RMSE = sqrt(mean(error^2)), relative.RMSE = sqrt(mean(rel.error^2)),
Rsqrt = 1 - (var(error)/var(all.flats$price)))
## # A tibble: 3 x 4
## ModelType RMSE relative.RMSE Rsqrt
## <fct> <dbl> <dbl> <dbl>
## 1 abs.model 128927. 0.338 0.206
## 2 log.model 129715. 0.311 0.210
## 3 sqr.model 128326. 0.339 0.214
all.flats %>% gather(key=ModelType, value=pred, pred.abs, pred.log, pred.sqr) %>%
ggplot(aes(x=sqrMeters)) + geom_point(aes(y=price)) +
geom_line(aes(y = pred, color = ModelType), size=1) +
scale_color_brewer(palette = "Dark2")
The plot alone cannot be unequivocally interpreted in favour of one model or another. The numeric results show that the log transformation of the response variable brings a small improvement over the base model, as indicated by a higher R-squared and a lower relative RMSE, while the squared-size model attains the lowest RMSE.
In order to estimate the out-of-sample error when predicting prices on new data, we apply cross-validation using two different R packages: vtreat and caret.
library(vtreat)
set.seed(1215) # set the seed for reproducibility
splitPlan <- kWayCrossValidation(nrow(all.flats), 10, NULL, NULL)
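Each element of splitPlan describes one fold as a list with two integer vectors: train (the rows used for fitting) and app (the hold-out rows the model is applied to). This can be verified with, e.g.:
str(splitPlan[[1]]) # one fold: $train and $app index vectors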
# Get cross-val predictions for the base model: price ~ sqrMeters
all.flats$pred.abs.cv <- 0 # initialize the prediction vector
for(i in 1:10) {
split <- splitPlan[[i]]
model.abs.cv <- lm(fmla.abs, data = all.flats[split$train,])
all.flats$pred.abs.cv[split$app] <- predict(model.abs.cv,
newdata = all.flats[split$app,])
}
# Get cross-val predictions for the log model: log(price) ~ sqrMeters
all.flats$pred.log.cv <- 0
for(i in 1:10) {
split <- splitPlan[[i]]
model.log.cv <- lm(fmla.log, data = all.flats[split$train,])
all.flats$pred.log.cv[split$app] <- exp(predict(model.log.cv,
newdata = all.flats[split$app,]))
}
# Get cross-val predictions for the sqr model: price ~ sqrMeters^2
all.flats$pred.sqr.cv <- 0
for(i in 1:10) {
split <- splitPlan[[i]]
model.sqr.cv <- lm(fmla.sqr, data = all.flats[split$train,])
all.flats$pred.sqr.cv[split$app] <- predict(model.sqr.cv,
newdata = all.flats[split$app,])
}
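The three loops differ only in the formula and the optional back-transform; as a sketch, they could be collapsed into a single helper (cv.predict is a hypothetical name, not a vtreat function):
cv.predict <- function(fmla, data, plan, transform = identity) {
  pred <- numeric(nrow(data)) # initialize the prediction vector
  for (split in plan) {
    fit <- lm(fmla, data = data[split$train, ]) # fit on the training rows of this fold
    pred[split$app] <- transform(predict(fit, newdata = data[split$app, ])) # predict on the hold-out rows
  }
  pred
}
# e.g. all.flats$pred.log.cv <- cv.predict(fmla.log, all.flats, splitPlan, exp)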
# Gather the predictions and calculate the residuals
all.flats %>%
gather(key=ModelTestCV, value=pred.cv, pred.abs.cv, pred.log.cv, pred.sqr.cv) %>%
mutate(error = pred.cv - price, rel.error = (pred.cv - price)/price) %>%
group_by(ModelTestCV) %>% # the relative error is calculated to compare the models
summarise(rmse = sqrt(mean(error^2)), rel.rmse=sqrt(mean(rel.error^2)))
## # A tibble: 3 x 3
## ModelTestCV rmse rel.rmse
## <chr> <dbl> <dbl>
## 1 pred.abs.cv 129218. 0.339
## 2 pred.log.cv 129997. 0.312
## 3 pred.sqr.cv 128691. 0.341
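The cross-validated RMSEs are only marginally higher than the corresponding in-sample values, so none of the three simple models overfits noticeably, and their ranking is unchanged.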
library(caret)
set.seed(1215)
# Fit the base model
model.abs <- train(price ~ sqrMeters, all.flats, method="lm",
trControl = trainControl(method = 'cv', number = 10,
savePredictions = 'all',
returnResamp = 'all'))
model.abs # CV test results
## Linear Regression
##
## 1649 samples
## 1 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1485, 1484, 1484, 1484, 1484, 1484, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 127579 0.2125172 84684.12
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# str(model.abs) lists everything the train object stores about the cross-validation
# and the model fit; the element labeled 'finalModel' holds the model refitted on the
# full dataset and is used below to extract its fitted values
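For example, the hold-out information kept by savePredictions and returnResamp can be inspected directly (output omitted):
head(model.abs$resample) # per-fold RMSE, Rsquared and MAE on the hold-out sets
head(model.abs$pred) # row-level hold-out predictions with fold labels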
summary(model.abs) # model fitting results
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -412694 -65070 -25035 33965 973129
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 164920.3 10068.3 16.38 <2e-16 ***
## sqrMeters 3216.3 155.5 20.69 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 129000 on 1647 degrees of freedom
## Multiple R-squared: 0.2063, Adjusted R-squared: 0.2058
## F-statistic: 428 on 1 and 1647 DF, p-value: < 2.2e-16
# create data frame with information required for calculation of model fitting metrics
fit.abs <- model.abs$finalModel$fitted.values %>% as.numeric() %>% as.data.frame()
price <- model.abs$trainingData$.outcome %>% as.numeric() %>% as.data.frame()
p.abs <- bind_cols(fit.abs, price)
colnames(p.abs) <- c('fitted', 'outcome')
p.abs <- p.abs %>% mutate(error = outcome - fitted, rel.error = (fitted/outcome) - 1, ModelType = 'model.abs') %>% select(ModelType, error, rel.error)
# Fit the log-transformed model
model.log <- train(log(price) ~ sqrMeters, all.flats, method="lm",
trControl = trainControl(method = 'cv', number = 10,
savePredictions = 'all',
returnResamp = 'all'))
model.log
## Linear Regression
##
## 1649 samples
## 1 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1483, 1484, 1484, 1484, 1485, 1485, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.2929052 0.216426 0.2146888
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(model.log)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.29166 -0.16962 -0.02177 0.13597 1.39913
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.228e+01 2.293e-02 535.69 <2e-16 ***
## sqrMeters 7.451e-03 3.541e-04 21.05 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2938 on 1647 degrees of freedom
## Multiple R-squared: 0.2119, Adjusted R-squared: 0.2114
## F-statistic: 442.9 on 1 and 1647 DF, p-value: < 2.2e-16
# create data frame with information required for calculation of model fitting metrics
fit.log <- model.log$finalModel$fitted.values %>% as.numeric() %>% exp() %>% as.data.frame()
# with the formula log(price) ~ sqrMeters, caret stores '.outcome' on the log scale
# (see the Call and coefficients above), so the outcome must be back-transformed too
price <- model.log$trainingData$.outcome %>% as.numeric() %>% exp() %>% as.data.frame()
p.log <- bind_cols(fit.log, price)
colnames(p.log) <- c('fitted', 'outcome')
p.log <- p.log %>% mutate(error = outcome - fitted, rel.error = (fitted/outcome) - 1, ModelType = 'model.log') %>% select(ModelType, error, rel.error)
# Fit the squared-size model: price ~ sqrMeters^2
model.sqr <- train(price ~ I(sqrMeters^2), all.flats, method = "lm",
trControl = trainControl(method = "cv", number = 10,
savePredictions = 'all',
returnResamp = 'all'))
# inspect the results
model.sqr # CV test results
## Linear Regression
##
## 1649 samples
## 1 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1483, 1484, 1485, 1483, 1483, 1485, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 127969.4 0.2141595 83627.09
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(model.sqr) # model fitting results
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -480091 -67145 -20478 31749 985163
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.701e+05 5.395e+03 50.07 <2e-16 ***
## `I(sqrMeters^2)` 2.205e+01 1.042e+00 21.15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 128400 on 1647 degrees of freedom
## Multiple R-squared: 0.2136, Adjusted R-squared: 0.2132
## F-statistic: 447.5 on 1 and 1647 DF, p-value: < 2.2e-16
# create data frame with information required for calculation of model fitting metrics
fit.sqr <- model.sqr$finalModel$fitted.values %>% as.numeric() %>% as.data.frame()
price <- model.sqr$trainingData$.outcome %>% as.numeric() %>% as.data.frame()
p.sqr <- bind_cols(fit.sqr, price)
colnames(p.sqr) <- c('fitted', 'outcome')
p.sqr <- p.sqr %>% mutate(error = outcome - fitted, rel.error = (fitted/outcome) - 1, ModelType = 'model.sqr') %>% select(ModelType, error, rel.error)
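The three extraction blocks above repeat the same steps; as a sketch, a small helper could replace them (fit.metrics is a hypothetical name):
fit.metrics <- function(model, label, transform = identity) {
  fitted  <- transform(as.numeric(model$finalModel$fitted.values)) # fitted values, back-transformed if needed
  outcome <- transform(as.numeric(model$trainingData$.outcome))    # outcome on the same scale
  data.frame(ModelType = label,
             error = outcome - fitted,
             rel.error = fitted / outcome - 1)
}
# e.g. p.log <- fit.metrics(model.log, 'model.log', exp)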
# bind the data frames with fitting results of the three models
results <- bind_rows(p.abs, p.log, p.sqr)
results %>% group_by(ModelType) %>% # compare the three models
summarise(RMSE = sqrt(mean(error^2)), relative.RMSE = sqrt(mean(rel.error^2)),
Rsqrt = 1 - (var(error)/var(all.flats$price)))
## # A tibble: 3 x 4
## ModelType RMSE relative.RMSE Rsqrt
## <chr> <dbl> <dbl> <dbl>
## 1 model.abs 128927. 0.338 0.206
## 2 model.log 129715. 0.311 0.210
## 3 model.sqr 128326. 0.339 0.214
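Both cross-validation procedures confirm the full-data comparison: the squared-size model yields the lowest RMSE, the log model the lowest relative RMSE, and the differences between the three simple models remain small.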