XGBoost Test

#install.packages("drat", repos="https://cran.rstudio.com")
#drat:::addRepo("dmlc")
#install.packages("xgboost", repos="http://dmlc.ml/drat/", type = "source")
#install.packages("xgboost")
#install.packages(c("dplyr", "hflights"))
#install.packages("dummies")
#install.packages("MASS")

require(xgboost)
library(dplyr)
library(hflights)
library(dummies)
library(MASS)

# Load the training and test sets from the desktop folder.
# NOTE(review): setwd() to a user-specific absolute path makes this script
# non-portable; it is kept because it is part of the script's behaviour.
setwd("C:/Users/POSCOUSER/Desktop/")
train <- read.csv(file = "train.csv")
test <- read.csv(file = "test.csv")

# Column names come from the test set and are reused to split both sets.
col_names <- names(test)

# Prepare the training data: every column except TARGET is a feature, and
# TARGET alone is kept (as a one-column data frame) for the label.
train_x <- train[, col_names[col_names != "TARGET"], drop = FALSE]
train_y <- train[, col_names[col_names == "TARGET"], drop = FALSE]

# Names of the feature columns that are passed to dummy.data.frame() for
# dummy encoding.
# NOTE(review): "FAC_OP_CD" appears twice in this list; preserved as-is.
cate_list_x <- c(
  "CH_pass_fac_flag", "SM_STEEL_GRD", "CH_SEQ_IN_CAST",
  "SAME_TUND_TR_CAST_SEQ", "FAC_OP_CD", "FCE_NUM", "SM_LD_BLW_METH_TP",
  "LD_FCE_AC_UBLW_PTRN_NUM", "LD_FCE_LW_BW_PTRN_NUM", "BAP_SILSI_gubun",
  "FAC_OP_CD", "RH_TP", "RH_PRC_PAS_NUM", "RH_DELTA_T_FLAG",
  "SECOND_REF_RH_PRC_PTRN", "RH_VACCUM_PRC_TIM", "RH_sun_OB_flag",
  "RH_OB_over_50_flag", "RH_COOL", "RH__cool_500_flag", "RH_over_500_flag",
  "LEADTIME_TP", "TD_CH_DELTA_T_flag", "TD_uncond_DELTA_T_flag",
  "TD_UZ_AR_change", "TD_UZ_AR_hu_change_flag", "TD_SN_TUNDISH_MI_OPEN_H",
  "TUNDISH_SN_OPEN_H_VAR_2_OV", "TUNDISH_SN_OPEN_H_VAR_4_OV",
  "TUNDISH_SN_OPEN_H_VAR_6_OV", "TUNDISH_SN_OPEN_H_VAR_8_OV",
  "CC_SETT_NOZ_CLOSE_X_M1_FLAG", "S_AL_LOSS_OVER_80", "AL_FALG",
  "DAN_CHANGE", "FC_change", "EMS_CONT", "EMS_PATTERN", "MOLD_HEIGHT_3mm",
  "mold_change_5mm", "mold_change_flag", "SLAB_wid_flag", "SH_CASTP_ASN_TP",
  "UNCOND_SLAB_ASS_FLAG", "SF_flag", "order_abnor_SF_flag"
)

# X data preprocessing
#
# fn: rescale a numeric vector so that its largest non-missing value maps
# to 10 (x * 10 / max(x)).
#
# x: numeric vector; NA entries are ignored when finding the maximum and
#    stay NA in the result.
# Returns a double vector of the same length as x. When x has no
# non-missing value, or its maximum is 0, there is nothing to scale by and
# the values are returned unchanged instead of becoming NaN/Inf through a
# division by zero.
fn <- function(x) {
  observed <- x[!is.na(x)]
  peak <- if (length(observed) > 0) max(observed) else NA_real_
  if (is.na(peak) || peak == 0) {
    # `* 1` keeps the result a double, like the scaled branch below.
    return(x * 1)
  }
  x * 10 / peak
}
# Dummy-encode the listed categorical columns, then rescale every numeric
# column of the encoded frame with fn().
pre_train_x <- dummy.data.frame(train_x, names = cate_list_x, sep = "_")
ind <- vapply(pre_train_x, is.numeric, logical(1))
pre_train_x[ind] <- lapply(pre_train_x[ind], fn)
# Disabled alternative: standardise with scale() instead of fn().
#ind = sapply(pre_train_x, is.numeric)
#pre_train_x[ind] <- lapply(pre_train_x[ind], scale)
head(pre_train_x)

# Y data preprocessing: encode the label as FAIL = 0, OK = 1.
# The mapping is done by value. The previous `as.integer(recode(...)) - 1`
# relied on TARGET being a factor (it subtracted 1 from the factor codes);
# when read.csv() returns character columns (the default since R 4.0.0) it
# produced OK = 0 and FAIL = -1 instead. Any value other than "OK"/"FAIL"
# becomes NA.
train_y$TARGET <- unname(
  c(FAIL = 0L, OK = 1L)[as.character(train_y$TARGET)]
)

# Save the preprocessed training features.
write.csv(x = pre_train_x, file = "c:/pretrained.csv")

# Train the model.
# Build the numeric matrices handed to xgboost(); dtrain_x / dtrain_y were
# previously used here without ever being created. They mirror how
# dtest_x / dtest_y are built for the test set.
dtrain_x <- data.matrix(pre_train_x)
dtrain_y <- data.matrix(train_y)
head(dtrain_x)

# NOTE(review): eta = 5 is far outside the [0, 1] range XGBoost documents
# for the learning rate; left unchanged because the intended value is not
# known -- confirm (0.5?).
# `nrounds` is spelled out in full instead of relying on partial matching
# of `nround`.
bstSparse <- xgboost(
  data = dtrain_x,
  label = dtrain_y,
  max.depth = 7,
  eta = 5,
  nthread = 2,
  nrounds = 10,
  objective = "binary:logistic"
)


# Prepare the test data the same way as the training data.
test_x <- subset(test, select = col_names[col_names != "TARGET"])
test_y <- subset(test, select = col_names[col_names == "TARGET"])

pre_test_x <- dummy.data.frame(test_x, names = cate_list_x, sep = "_")

# Rescale the test features with the TRAINING maxima so both sets share one
# scale (the training features were rescaled to x * 10 / max, the test
# features previously were not rescaled at all). The maxima are recomputed
# from the unscaled, dummy-encoded training features because pre_train_x
# has already been rescaled.
train_ref_x <- dummy.data.frame(train_x, names = cate_list_x, sep = "_")
for (nm in intersect(names(pre_test_x), names(train_ref_x))) {
  if (is.numeric(pre_test_x[[nm]]) && is.numeric(train_ref_x[[nm]])) {
    observed <- train_ref_x[[nm]][!is.na(train_ref_x[[nm]])]
    # Skip columns with no usable maximum to avoid dividing by zero.
    if (length(observed) > 0 && max(observed) != 0) {
      pre_test_x[[nm]] <- pre_test_x[[nm]] * 10 / max(observed)
    }
  }
}

# Align the columns with the training frame: dummy levels that never occur
# in the test set become all-zero columns, levels unseen during training
# are dropped, and the column order matches pre_train_x.
for (nm in setdiff(names(pre_train_x), names(pre_test_x))) {
  pre_test_x[[nm]] <- rep(0, nrow(pre_test_x))
}
pre_test_x <- pre_test_x[, names(pre_train_x), drop = FALSE]

# Encode the label by value (FAIL = 0, OK = 1). The previous
# `as.integer(recode(...)) - 1` only produced 0/1 when TARGET was a factor;
# for character columns it produced 0/-1.
test_y$TARGET <- unname(
  c(FAIL = 0L, OK = 1L)[as.character(test_y$TARGET)]
)

dtest_x <- data.matrix(pre_test_x)
dtest_y <- data.matrix(test_y)


# Evaluate on the test data.
# Predict on the test matrix: the result is scored against test_y below,
# so predicting on dtrain_x (as before) compared unrelated rows.
pred <- predict(bstSparse, dtest_x)

# Turn the binary:logistic output into 0/1 classes with a 0.5 cut-off.
# The previous as.integer(round(pred, 1)) rounded to one decimal and then
# truncated, so every prediction below 0.95 became 0.
pred_binary <- as.integer(pred > 0.5)


# ev: fraction of positions at which two label vectors agree (accuracy).
#
# A: vector of reference labels.
# B: vector of predicted labels, same length as A.
# Returns a single number in [0, 1], or NA when the inputs are empty. The
# value is also printed, as before.
# Stops with an error when A and B differ in length.
ev <- function(A, B) {
  if (length(A) != length(B)) {
    stop("A and B must have the same length", call. = FALSE)
  }
  # Plain hits / n. The earlier loop started both counters at 1 and so
  # reported (hits + 1) / (n + 2).
  accuracy <- if (length(A) == 0) NA_real_ else mean(A == B)
  print(accuracy)
  accuracy
}

# Show the first training label and the predicted classes, then score the
# predictions against the test labels.
# R vectors are 1-indexed: the previous `[0]` returned an empty vector.
train_y$TARGET[1]
pred_binary
result <- ev(test_y$TARGET, pred_binary)
result


Leave a Reply

Your email address will not be published. Required fields are marked *