# Train an XGBoost binary classifier (TARGET: OK = 1, FAIL = 0) on train.csv
# and report its accuracy on test.csv.
#
# One-time setup (run manually if the packages are missing):
# install.packages("drat", repos = "https://cran.rstudio.com")
# drat:::addRepo("dmlc")
# install.packages("xgboost", repos = "http://dmlc.ml/drat/", type = "source")
# install.packages("xgboost")
# install.packages(c("dplyr", "hflights"))
# install.packages("dummies")
# install.packages("MASS")

# library() fails loudly on a missing package; require() would only return
# FALSE and let the script die later with a confusing error.
library(xgboost)
library(dplyr)
library(hflights)
library(dummies)
library(MASS)

# Directory holding the input CSVs. Paths are built with file.path() instead
# of calling setwd(), so the caller's working directory is left untouched.
data_dir <- "C:/Users/POSCOUSER/Desktop/"
train <- read.csv(file.path(data_dir, "train.csv"))
test <- read.csv(file.path(data_dir, "test.csv"))

col_names <- colnames(test)
feature_cols <- col_names[col_names != "TARGET"]

# Columns that are one-hot encoded before training.
# ("FAC_OP_CD" was listed twice in the original list; it is kept once.)
cate_list_x <- c(
  "CH_pass_fac_flag", "SM_STEEL_GRD", "CH_SEQ_IN_CAST",
  "SAME_TUND_TR_CAST_SEQ", "FAC_OP_CD", "FCE_NUM", "SM_LD_BLW_METH_TP",
  "LD_FCE_AC_UBLW_PTRN_NUM", "LD_FCE_LW_BW_PTRN_NUM", "BAP_SILSI_gubun",
  "RH_TP", "RH_PRC_PAS_NUM", "RH_DELTA_T_FLAG", "SECOND_REF_RH_PRC_PTRN",
  "RH_VACCUM_PRC_TIM", "RH_sun_OB_flag", "RH_OB_over_50_flag", "RH_COOL",
  "RH__cool_500_flag", "RH_over_500_flag", "LEADTIME_TP",
  "TD_CH_DELTA_T_flag", "TD_uncond_DELTA_T_flag", "TD_UZ_AR_change",
  "TD_UZ_AR_hu_change_flag", "TD_SN_TUNDISH_MI_OPEN_H",
  "TUNDISH_SN_OPEN_H_VAR_2_OV", "TUNDISH_SN_OPEN_H_VAR_4_OV",
  "TUNDISH_SN_OPEN_H_VAR_6_OV", "TUNDISH_SN_OPEN_H_VAR_8_OV",
  "CC_SETT_NOZ_CLOSE_X_M1_FLAG", "S_AL_LOSS_OVER_80", "AL_FALG",
  "DAN_CHANGE", "FC_change", "EMS_CONT", "EMS_PATTERN", "MOLD_HEIGHT_3mm",
  "mold_change_5mm", "mold_change_flag", "SLAB_wid_flag", "SH_CASTP_ASN_TP",
  "UNCOND_SLAB_ASS_FLAG", "SF_flag", "order_abnor_SF_flag"
)

# Rescale a numeric vector so that `x_max` maps to 10.
#
# x      Numeric vector to rescale.
# x_max  Scaling denominator. Defaults to the largest non-missing value of
#        `x`; pass the training maximum to scale test data identically.
#
# Returns `x` unchanged when no usable denominator exists (all values NA, or
# a maximum of 0), instead of producing NaN/Inf.
fn <- function(x, x_max = NULL) {
  if (is.null(x_max)) {
    if (all(is.na(x))) {
      return(x)
    }
    x_max <- max(x, na.rm = TRUE)
  }
  if (!is.finite(x_max) || x_max == 0) {
    return(x)
  }
  x * 10 / x_max
}

# Largest non-missing value of one column, or NA when there is none.
column_max <- function(x) {
  if (all(is.na(x))) NA_real_ else max(x, na.rm = TRUE)
}

# Encode the OK/FAIL label as 1/0. Comparing the text directly works whether
# read.csv() produced a factor (R < 4.0) or a character column (R >= 4.0);
# the former recode() + as.integer() - 1 chain was only right for factors.
encode_target <- function(target) {
  target <- as.character(target)
  unknown <- setdiff(unique(target[!is.na(target)]), c("OK", "FAIL"))
  if (length(unknown) > 0) {
    stop(
      "Unexpected TARGET value(s): ", paste(unknown, collapse = ", "),
      call. = FALSE
    )
  }
  as.integer(target == "OK")
}

# Fraction of positions where the labels `A` and the predictions `B` agree.
# Prints the accuracy and also returns it. Returns NA for empty input.
ev <- function(A, B) {
  stopifnot(length(A) == length(B))
  accuracy <- if (length(A) == 0) NA_real_ else mean(A == B)
  print(accuracy)
  accuracy
}

# ---- Prepare training data ----
train_x <- train[, feature_cols, drop = FALSE]
train_y <- train[, "TARGET", drop = FALSE]

# Feature processing: one-hot encode the categorical columns, then rescale
# every numeric column by its own maximum.
pre_train_x <- dummy.data.frame(train_x, names = cate_list_x, sep = "_")
ind <- vapply(pre_train_x, is.numeric, logical(1))
# Training maxima are kept so the test set can be scaled the same way.
train_max <- vapply(pre_train_x[ind], column_max, numeric(1))
if (any(ind)) {
  pre_train_x[ind] <- Map(fn, pre_train_x[ind], train_max)
}
# Alternative that was tried: standardise instead of max-scaling.
# pre_train_x[ind] <- lapply(pre_train_x[ind], scale)
head(pre_train_x)

# Label processing.
train_y$TARGET <- encode_target(train_y$TARGET)

# Save the preprocessed features.
write.csv(pre_train_x, "c:/pretrained.csv")

# These matrices were used by the original script but never created.
dtrain_x <- data.matrix(pre_train_x)
dtrain_y <- train_y$TARGET
head(dtrain_x)

# NOTE(review): the original passed eta = 5, which is outside the [0, 1]
# range documented for XGBoost's learning rate. The library default (0.3) is
# used here; tune as needed.
bstSparse <- xgboost(
  data = dtrain_x,
  label = dtrain_y,
  max_depth = 7,
  eta = 0.3,
  nthread = 2,
  nrounds = 10,
  objective = "binary:logistic"
)

# ---- Prepare test data ----
test_x <- test[, feature_cols, drop = FALSE]
test_y <- test[, "TARGET", drop = FALSE]

pre_test_x <- dummy.data.frame(test_x, names = cate_list_x, sep = "_")
# Give the test set exactly the training columns, in the same order: dummy
# columns for levels absent from the test data are added as all-zero, and
# columns for levels unseen during training are dropped.
for (missing_col in setdiff(names(pre_train_x), names(pre_test_x))) {
  pre_test_x[[missing_col]] <- rep(0, nrow(pre_test_x))
}
pre_test_x <- pre_test_x[, names(pre_train_x), drop = FALSE]
# Apply the training-set scaling so both sets share one feature scale.
scaled_cols <- names(train_max)
if (length(scaled_cols) > 0) {
  pre_test_x[scaled_cols] <- Map(fn, pre_test_x[scaled_cols], train_max)
}

test_y$TARGET <- encode_target(test_y$TARGET)

dtest_x <- data.matrix(pre_test_x)
dtest_y <- data.matrix(test_y)

# ---- Evaluate on the test data ----
# Score the test matrix (the original scored the training matrix and then
# compared the result against the test labels).
pred <- predict(bstSparse, dtest_x)
# Threshold the probabilities at 0.5. as.integer(round(pred, 1)) truncated
# everything below 0.95 down to 0.
pred_binary <- as.integer(pred > 0.5)

# R vectors are 1-indexed; [0] returned an empty vector.
train_y$TARGET[1]
pred_binary
result <- ev(test_y$TARGET, pred_binary)
result