library(glmnet)
library(caret)
library(gbm)
data <- read.csv("Ames_data.csv", stringsAsFactors = FALSE)
testIDs <- read.table("project1_testIDs.dat", stringsAsFactors = FALSE)
start_time <- Sys.time()
for(j in 1:10){
train <- data[-testIDs[,j], ]
test <- data[testIDs[,j], ]
test.y <- test[, c(1, 83)]
test <- test[, -83]
write.csv(train,"train.csv",row.names=FALSE)
write.csv(test, "test.csv",row.names=FALSE)
write.csv(test.y,"test_y.csv",row.names=FALSE)
train <- read.csv("train.csv", stringsAsFactors = FALSE)
test <- read.csv("test.csv", stringsAsFactors = FALSE)
set.seed(1093)
winsor.vars = c("Lot_Frontage", "Lot_Area", "Mas_Vnr_Area", "BsmtFin_SF_2", "Bsmt_Unf_SF", "Total_Bsmt_SF", "Second_Flr_SF", 'First_Flr_SF', "Gr_Liv_Area", "Garage_Area", "Wood_Deck_SF", "Open_Porch_SF", "Enclosed_Porch", "Three_season_porch", "Screen_Porch", "Misc_Val")
remove.var = c('Street', 'Utilities', 'Condition_2', 'Roof_Matl', 'Heating', 'Pool_QC', 'Misc_Feature', 'Low_Qual_Fin_SF', 'Pool_Area', 'Longitude','Latitude')
na_replace = function(x){if(is.na(x)){return(0)} else{return(x)}}
quan.value = 0.95
###########################################
# Step 1: Preprocess training data
# and fit two models
#
train <- read.csv("train.csv", stringsAsFactors = FALSE)
#
# YOUR CODE
#
# GBM
gbm_train = train
gbm_train[sapply(gbm_train, is.character)] = lapply(gbm_train[sapply(gbm_train, is.character)], as.factor)
n = nrow(gbm_train)
gbm_fit = gbm(log(Sale_Price) ~ ., data = gbm_train[,-which(names(gbm_train) == "PID" )],
distribution = "gaussian",
n.trees = 500,
shrinkage = 0.1,
interaction.depth = 5,
bag.fraction = 1,
cv.folds = 5)
# Elastic Net
# Remove Vars
en_train = train
en_train = en_train[, !names(en_train) %in% remove.var]
# Character column to one hot encoding
en_train[sapply(en_train, is.character)] = lapply(en_train[sapply(en_train, is.character)], as.factor)
train_factors = en_train
dummy = dummyVars("~.", data = en_train[sapply(en_train, is.factor)])
train_one_hot_encodings = data.frame(predict(dummy, newdata=en_train[sapply(en_train, is.factor)]))
en_train = cbind(en_train, train_one_hot_encodings)
en_train = en_train[,!names(en_train) %in% names(en_train[sapply(en_train, is.factor)])]
# Replace NA
en_train[,which(names(en_train) == "Garage_Yr_Blt")] = sapply(en_train[,which(names(en_train) == "Garage_Yr_Blt")],na_replace)
# Winsorize
for(var in winsor.vars){
tmp = en_train[, var]
myquan = quantile(tmp, probs = quan.value, na.rm = TRUE)
tmp[tmp > myquan] <- myquan
en_train[, var] = tmp
}
# Convert to train matrix
X.train = as.matrix(en_train[, !names(en_train) %in% c('PID','Sale_Price')])
Y.train = as.matrix(en_train[, names(en_train) %in% c('Sale_Price')])
# Elastic Net Model
cv.out = cv.glmnet(X.train, log(Y.train), alpha = 0.2)
best.lam = cv.out$lambda.min
###########################################
# Step 2: Preprocess test data
# and output predictions into two files
#
test <- read.csv("test.csv", stringsAsFactors = FALSE)
#
# YOUR CODE
#
# GBM
gbm_test = test
for(i in 1:ncol(gbm_test)){
if(is.character(gbm_test[,i])){
gbm_test[,i] = factor(gbm_test[,i], levels = levels(gbm_train[,i]))
}
}
gbm_test.y.pred = exp(predict(gbm_fit, newdata=gbm_test))
gbm_test.y.pred = data.frame(PID = test[,1], Sale_Price = gbm_test.y.pred)
write.csv(gbm_test.y.pred, file = paste("mysubmission_gbm_",j,".txt", sep=""), quote= FALSE, row.names = FALSE)
# Elastic Net
# Remove vars
en_test = test
en_test = en_test[, !names(en_test) %in% remove.var]
# Character column to one hot encoding
for(i in 1:ncol(en_test)){
if(is.character(en_test[,i])){
en_test[,i] = factor(en_test[,i], levels = levels(train_factors[,i]))
}
}
test_one_hot_encodings = data.frame(predict(dummy, newdata=en_test[sapply(en_test, is.factor)]))
en_test = cbind(en_test, test_one_hot_encodings)
en_test = en_test[,!names(en_test) %in% names(en_test[sapply(en_test, is.factor)])]
# Replace NA
en_test[,which(names(en_test) == "Garage_Yr_Blt")] = sapply(en_test[,which(names(en_test) == "Garage_Yr_Blt")],na_replace)
na_values_matrix = which(is.na(en_test), arr.ind=TRUE)
for(i in 1:nrow(na_values_matrix)){
en_test[na_values_matrix[i,1],na_values_matrix[i,2]] = 0
}
# Winsorize
for(var in winsor.vars){
tmp = en_test[, var]
myquan = quantile(tmp, probs = quan.value, na.rm = TRUE)
tmp[tmp > myquan] <- myquan
en_test[, var] = tmp
}
# Convert to test matrix
X.test = as.matrix(en_test[,!names(en_test) %in% c('PID')])
# Predict
en_test.y.pred = exp(predict(cv.out, s = best.lam, newx=X.test))
en_test.y.pred = data.frame(PID = en_test[,1], Sale_Price = en_test.y.pred[,1])
write.csv(en_test.y.pred, file = paste("mysubmission_en_",j,".txt", sep=""), quote= FALSE, row.names = FALSE)
}
end_time <- Sys.time()
end_time - start_time
for(j in 1:10){
gbm_pred <- read.csv(paste("mysubmission_gbm_",j,".txt", sep=""))
en_pred <- read.csv(paste("mysubmission_en_",j,".txt", sep=""))
test <- data[testIDs[,j], ]
test.y <- test[, c(1, 83)]
names(test.y)[2] <- "True_Sale_Price"
gbm_pred <- merge(gbm_pred, test.y, by="PID")
en_pred <- merge(en_pred, test.y, by="PID")
print(j)
print(sqrt(mean((log(gbm_pred$Sale_Price) - log(gbm_pred$True_Sale_Price))^2)))
print(sqrt(mean((log(en_pred$Sale_Price) - log(en_pred$True_Sale_Price))^2)))
}
LS0tDQp0aXRsZTogIlByb2plY3QgMTogUHJlZGljdCB0aGUgSG91c2luZyBQcmljZXMgaW4gQW1lcyINCmRhdGU6ICJTcHJpbmcgMjAyMSINCm91dHB1dDoNCiAgaHRtbF9ub3RlYm9vazoNCiAgICB0aGVtZTogcmVhZGFibGUNCiAgICB0b2M6IFRSVUUNCiAgICB0b2NfZmxvYXQ6IFRSVUUNCi0tLQ0KDQoNCg0KYGBge3J9DQpsaWJyYXJ5KGdsbW5ldCkNCmxpYnJhcnkoY2FyZXQpDQpsaWJyYXJ5KGdibSkNCg0KZGF0YSA8LSByZWFkLmNzdigiQW1lc19kYXRhLmNzdiIsIHN0cmluZ3NBc0ZhY3RvcnMgPSBGQUxTRSkNCnRlc3RJRHMgPC0gcmVhZC50YWJsZSgicHJvamVjdDFfdGVzdElEcy5kYXQiLCBzdHJpbmdzQXNGYWN0b3JzID0gRkFMU0UpDQoNCnN0YXJ0X3RpbWUgPC0gU3lzLnRpbWUoKQ0KZm9yKGogaW4gMToxMCl7DQogIA0KICB0cmFpbiA8LSBkYXRhWy10ZXN0SURzWyxqXSwgXQ0KICB0ZXN0IDwtIGRhdGFbdGVzdElEc1ssal0sIF0NCiAgdGVzdC55IDwtIHRlc3RbLCBjKDEsIDgzKV0NCiAgdGVzdCA8LSB0ZXN0WywgLTgzXQ0KICB3cml0ZS5jc3YodHJhaW4sInRyYWluLmNzdiIscm93Lm5hbWVzPUZBTFNFKQ0KICB3cml0ZS5jc3YodGVzdCwgInRlc3QuY3N2Iixyb3cubmFtZXM9RkFMU0UpDQogIHdyaXRlLmNzdih0ZXN0LnksInRlc3RfeS5jc3YiLHJvdy5uYW1lcz1GQUxTRSkNCiAgdHJhaW4gPC0gcmVhZC5jc3YoInRyYWluLmNzdiIsIHN0cmluZ3NBc0ZhY3RvcnMgPSBGQUxTRSkNCiAgdGVzdCA8LSByZWFkLmNzdigidGVzdC5jc3YiLCBzdHJpbmdzQXNGYWN0b3JzID0gRkFMU0UpDQogIA0KICBzZXQuc2VlZCgxMDkzKQ0KDQoNCg0Kd2luc29yLnZhcnMgPSBjKCJMb3RfRnJvbnRhZ2UiLCAiTG90X0FyZWEiLCAiTWFzX1Zucl9BcmVhIiwgIkJzbXRGaW5fU0ZfMiIsICJCc210X1VuZl9TRiIsICJUb3RhbF9Cc210X1NGIiwgIlNlY29uZF9GbHJfU0YiLCAnRmlyc3RfRmxyX1NGJywgIkdyX0xpdl9BcmVhIiwgIkdhcmFnZV9BcmVhIiwgIldvb2RfRGVja19TRiIsICJPcGVuX1BvcmNoX1NGIiwgIkVuY2xvc2VkX1BvcmNoIiwgIlRocmVlX3NlYXNvbl9wb3JjaCIsICJTY3JlZW5fUG9yY2giLCAiTWlzY19WYWwiKQ0KcmVtb3ZlLnZhciA9IGMoJ1N0cmVldCcsICdVdGlsaXRpZXMnLCAnQ29uZGl0aW9uXzInLCAnUm9vZl9NYXRsJywgJ0hlYXRpbmcnLCAnUG9vbF9RQycsICdNaXNjX0ZlYXR1cmUnLCAnTG93X1F1YWxfRmluX1NGJywgJ1Bvb2xfQXJlYScsICdMb25naXR1ZGUnLCdMYXRpdHVkZScpDQpuYV9yZXBsYWNlID0gZnVuY3Rpb24oeCl7aWYoaXMubmEoeCkpe3JldHVybigwKX0gZWxzZXtyZXR1cm4oeCl9fQ0KcXVhbi52YWx1ZSA9IDAuOTUNCg0KIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIw0KIyBTdGVwIDE6IFByZXByb2Nlc3MgdHJhaW5pbmcgZGF0YQ0KIyAgICAgICAgIGFuZCBmaXQgdHdvIG1vZGVscw0KIw0KdHJhaW4gPC0gcmVhZC5jc3YoInRyYWluLmNzdiIsIHN0cmluZ3NBc0ZhY3RvcnMgPSBGQUxTRSkNCiMNCiMgWU9VUiBDT0RFDQojIA0KDQojIEdCTQ0KZ2JtX3RyYWluID0gdHJhaW4NCmdibV90cmFpbltzYXBwbHkoZ2JtX3RyYWluLCBpcy5jaGFyYWN0ZXIpXSA9IGxhcHBseShnYm1fdHJhaW5bc2FwcGx5KGdibV90cmFpbiwgaXMuY2hhcmFjdGVyKV0sIGFzLmZhY3RvcikNCm4gPSBucm93KGdibV90cmFpbikNCmdibV9maXQgPSBnYm0obG9nKFNhbGVfUHJpY2UpIH4gLiwgZGF0YSA9IGdibV90cmFpblssLXdoaWNoKG5hbWVzKGdibV90cmFpbikgPT0gIlBJRCIgKV0sDQogICAgICAgICAgICAgZGlzdHJpYnV0aW9uID0gICJnYXVzc2lhbiIsDQogICAgICAgICAgICAgbi50cmVlcyA9IDUwMCwNCiAgICAgICAgICAgICBzaHJpbmthZ2UgPSAwLjEsDQogICAgICAgICAgICAgaW50ZXJhY3Rpb24uZGVwdGggPSA1LA0KICAgICAgICAgICAgIGJhZy5mcmFjdGlvbiA9IDEsDQogICAgICAgICAgICAgY3YuZm9sZHMgPSA1KQ0KDQojIEVsYXN0aWMgTmV0DQojIFJlbW92ZSBWYXJzDQplbl90cmFpbiA9IHRyYWluDQplbl90cmFpbiA9IGVuX3RyYWluWywgIW5hbWVzKGVuX3RyYWluKSAlaW4lIHJlbW92ZS52YXJdDQoNCiMgQ2hhcmFjdGVyIGNvbHVtbiB0byBvbmUgaG90IGVuY29kaW5nDQplbl90cmFpbltzYXBwbHkoZW5fdHJhaW4sIGlzLmNoYXJhY3RlcildID0gbGFwcGx5KGVuX3RyYWluW3NhcHBseShlbl90cmFpbiwgaXMuY2hhcmFjdGVyKV0sIGFzLmZhY3RvcikNCnRyYWluX2ZhY3RvcnMgPSBlbl90cmFpbg0KZHVtbXkgPSBkdW1teVZhcnMoIn4uIiwgZGF0YSA9IGVuX3RyYWluW3NhcHBseShlbl90cmFpbiwgaXMuZmFjdG9yKV0pDQp0cmFpbl9vbmVfaG90X2VuY29kaW5ncyA9IGRhdGEuZnJhbWUocHJlZGljdChkdW1teSwgbmV3ZGF0YT1lbl90cmFpbltzYXBwbHkoZW5fdHJhaW4sIGlzLmZhY3RvcildKSkNCmVuX3RyYWluID0gY2JpbmQoZW5fdHJhaW4sIHRyYWluX29uZV9ob3RfZW5jb2RpbmdzKQ0KZW5fdHJhaW4gPSBlbl90cmFpblssIW5hbWVzKGVuX3RyYWluKSAlaW4lIG5hbWVzKGVuX3RyYWluW3NhcHBseShlbl90cmFpbiwgaXMuZmFjdG9yKV0pXQ0KDQoNCiMgUmVwbGFjZSBOQQ0KZW5fdHJhaW5bLHdoaWNoKG5hbWVzKGVuX3RyYWluKSA9PSAiR2FyYWdlX1lyX0JsdCIpXSA9IHNhcHBseShlbl90cmFpblssd2hpY2gobmFtZXMoZW5fdHJhaW4pID09ICJHYXJhZ2VfWXJfQmx0IildLG5hX3JlcGxhY2UpDQoNCiMgV2luc29yaXplDQpmb3IodmFyIGluIHdpbnNvci52YXJzKXsNCiAgdG1wID0gZW5fdHJhaW5bLCB2YXJdDQogIG15cXVhbiA9IHF1YW50aWxlKHRtcCwgcHJvYnMgPSBxdWFuLnZhbHVlLCBuYS5ybSA9IFRSVUUpDQogIHRtcFt0bXAgPiBteXF1YW5dIDwtIG15cXVhbg0KICBlbl90cmFpblssIHZhcl0gPSB0bXANCn0NCg0KDQojIENvbnZlcnQgdG8gdHJhaW4gbWF0cml4DQpYLnRyYWluID0gYXMubWF0cml4KGVuX3RyYWluWywgIW5hbWVzKGVuX3RyYWluKSAlaW4lIGMoJ1BJRCcsJ1NhbGVfUHJpY2UnKV0pDQpZLnRyYWluID0gYXMubWF0cml4KGVuX3RyYWluWywgbmFtZXMoZW5fdHJhaW4pICVpbiUgYygnU2FsZV9QcmljZScpXSkNCg0KDQojIEVsYXN0aWMgTmV0IE1vZGVsDQpjdi5vdXQgPSBjdi5nbG1uZXQoWC50cmFpbiwgbG9nKFkudHJhaW4pLCBhbHBoYSA9IDAuMikNCmJlc3QubGFtID0gY3Yub3V0JGxhbWJkYS5taW4NCg0KIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIw0KIyBTdGVwIDI6IFByZXByb2Nlc3MgdGVzdCBkYXRhDQojICAgICAgICAgYW5kIG91dHB1dCBwcmVkaWN0aW9ucyBpbnRvIHR3byBmaWxlcw0KIw0KdGVzdCA8LSByZWFkLmNzdigidGVzdC5jc3YiLCBzdHJpbmdzQXNGYWN0b3JzID0gRkFMU0UpDQojDQojIFlPVVIgQ09ERQ0KIyANCg0KIyBHQk0NCmdibV90ZXN0ID0gdGVzdA0KZm9yKGkgaW4gMTpuY29sKGdibV90ZXN0KSl7DQogIGlmKGlzLmNoYXJhY3RlcihnYm1fdGVzdFssaV0pKXsNCiAgICBnYm1fdGVzdFssaV0gPSBmYWN0b3IoZ2JtX3Rlc3RbLGldLCBsZXZlbHMgPSBsZXZlbHMoZ2JtX3RyYWluWyxpXSkpDQogIH0NCn0NCg0KZ2JtX3Rlc3QueS5wcmVkID0gZXhwKHByZWRpY3QoZ2JtX2ZpdCwgbmV3ZGF0YT1nYm1fdGVzdCkpDQpnYm1fdGVzdC55LnByZWQgPSBkYXRhLmZyYW1lKFBJRCA9IHRlc3RbLDFdLCBTYWxlX1ByaWNlID0gZ2JtX3Rlc3QueS5wcmVkKQ0Kd3JpdGUuY3N2KGdibV90ZXN0LnkucHJlZCwgZmlsZSA9IHBhc3RlKCJteXN1Ym1pc3Npb25fZ2JtXyIsaiwiLnR4dCIsIHNlcD0iIiksICBxdW90ZT0gRkFMU0UsIHJvdy5uYW1lcyA9IEZBTFNFKQ0KDQojIEVsYXN0aWMgTmV0DQojIFJlbW92ZSB2YXJzDQplbl90ZXN0ID0gdGVzdA0KZW5fdGVzdCA9IGVuX3Rlc3RbLCAhbmFtZXMoZW5fdGVzdCkgJWluJSByZW1vdmUudmFyXQ0KDQojIENoYXJhY3RlciBjb2x1bW4gdG8gb25lIGhvdCBlbmNvZGluZw0KZm9yKGkgaW4gMTpuY29sKGVuX3Rlc3QpKXsNCiAgaWYoaXMuY2hhcmFjdGVyKGVuX3Rlc3RbLGldKSl7DQogICAgZW5fdGVzdFssaV0gPSBmYWN0b3IoZW5fdGVzdFssaV0sIGxldmVscyA9IGxldmVscyh0cmFpbl9mYWN0b3JzWyxpXSkpDQogIH0NCn0NCnRlc3Rfb25lX2hvdF9lbmNvZGluZ3MgPSBkYXRhLmZyYW1lKHByZWRpY3QoZHVtbXksIG5ld2RhdGE9ZW5fdGVzdFtzYXBwbHkoZW5fdGVzdCwgaXMuZmFjdG9yKV0pKQ0KZW5fdGVzdCA9IGNiaW5kKGVuX3Rlc3QsIHRlc3Rfb25lX2hvdF9lbmNvZGluZ3MpDQplbl90ZXN0ID0gZW5fdGVzdFssIW5hbWVzKGVuX3Rlc3QpICVpbiUgbmFtZXMoZW5fdGVzdFtzYXBwbHkoZW5fdGVzdCwgaXMuZmFjdG9yKV0pXQ0KDQojIFJlcGxhY2UgTkEgDQplbl90ZXN0Wyx3aGljaChuYW1lcyhlbl90ZXN0KSA9PSAiR2FyYWdlX1lyX0JsdCIpXSA9IHNhcHBseShlbl90ZXN0Wyx3aGljaChuYW1lcyhlbl90ZXN0KSA9PSAiR2FyYWdlX1lyX0JsdCIpXSxuYV9yZXBsYWNlKQ0KbmFfdmFsdWVzX21hdHJpeCA9IHdoaWNoKGlzLm5hKGVuX3Rlc3QpLCBhcnIuaW5kPVRSVUUpDQpmb3IoaSBpbiAxOm5yb3cobmFfdmFsdWVzX21hdHJpeCkpew0KICBlbl90ZXN0W25hX3ZhbHVlc19tYXRyaXhbaSwxXSxuYV92YWx1ZXNfbWF0cml4W2ksMl1dID0gMA0KfQ0KDQojIFdpbnNvcml6ZQ0KZm9yKHZhciBpbiB3aW5zb3IudmFycyl7DQogIHRtcCA9IGVuX3Rlc3RbLCB2YXJdDQogIG15cXVhbiA9IHF1YW50aWxlKHRtcCwgcHJvYnMgPSBxdWFuLnZhbHVlLCBuYS5ybSA9IFRSVUUpDQogIHRtcFt0bXAgPiBteXF1YW5dIDwtIG15cXVhbg0KICBlbl90ZXN0WywgdmFyXSA9IHRtcA0KfQ0KDQojIENvbnZlcnQgdG8gdGVzdCBtYXRyaXgNClgudGVzdCA9IGFzLm1hdHJpeChlbl90ZXN0WywhbmFtZXMoZW5fdGVzdCkgJWluJSBjKCdQSUQnKV0pDQoNCiMgUHJlZGljdA0KZW5fdGVzdC55LnByZWQgPSBleHAocHJlZGljdChjdi5vdXQsIHMgPSBiZXN0LmxhbSwgbmV3eD1YLnRlc3QpKQ0KZW5fdGVzdC55LnByZWQgPSBkYXRhLmZyYW1lKFBJRCA9IGVuX3Rlc3RbLDFdLCBTYWxlX1ByaWNlID0gZW5fdGVzdC55LnByZWRbLDFdKQ0Kd3JpdGUuY3N2KGVuX3Rlc3QueS5wcmVkLCBmaWxlID0gcGFzdGUoIm15c3VibWlzc2lvbl9lbl8iLGosIi50eHQiLCBzZXA9IiIpLCAgcXVvdGU9IEZBTFNFLCByb3cubmFtZXMgPSBGQUxTRSkNCg0KfQ0KZW5kX3RpbWUgPC0gU3lzLnRpbWUoKQ0KZW5kX3RpbWUgLSBzdGFydF90aW1lDQpgYGANCg0KYGBge3J9DQpmb3IoaiBpbiAxOjEwKXsNCmdibV9wcmVkIDwtIHJlYWQuY3N2KHBhc3RlKCJteXN1Ym1pc3Npb25fZ2JtXyIsaiwiLnR4dCIsIHNlcD0iIikpDQplbl9wcmVkIDwtIHJlYWQuY3N2KHBhc3RlKCJteXN1Ym1pc3Npb25fZW5fIixqLCIudHh0Iiwgc2VwPSIiKSkNCnRlc3QgPC0gZGF0YVt0ZXN0SURzWyxqXSwgXQ0KdGVzdC55IDwtIHRlc3RbLCBjKDEsIDgzKV0NCm5hbWVzKHRlc3QueSlbMl0gPC0gIlRydWVfU2FsZV9QcmljZSINCmdibV9wcmVkIDwtIG1lcmdlKGdibV9wcmVkLCB0ZXN0LnksIGJ5PSJQSUQiKQ0KZW5fcHJlZCA8LSBtZXJnZShlbl9wcmVkLCB0ZXN0LnksIGJ5PSJQSUQiKQ0KcHJpbnQoaikNCnByaW50KHNxcnQobWVhbigobG9nKGdibV9wcmVkJFNhbGVfUHJpY2UpIC0gbG9nKGdibV9wcmVkJFRydWVfU2FsZV9QcmljZSkpXjIpKSkNCnByaW50KHNxcnQobWVhbigobG9nKGVuX3ByZWQkU2FsZV9QcmljZSkgLSBsb2coZW5fcHJlZCRUcnVlX1NhbGVfUHJpY2UpKV4yKSkpDQp9DQpgYGANCg0K