5. Experiment 1:
Split the features into Categorical / Ordinal values
and Simple / Complex values,
then build a new dataset from the resulting predictions.
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import pandas as pd
# Column indices (within the category frame) of the "complex" feature group.
complex_lst = [8, 9, 10, 13, 14, 15, 22, 23, 25]

# Features / target split of the training frame (target is the last column).
x = train.iloc[:, :-1]
y = train.iloc[:, -1]

# BUG FIX: DataFrame.apply returns a NEW frame — the original discarded the
# result, so the label encoding never actually took effect. Assign it back.
# NOTE(review): fitting fresh LabelEncoders independently on x and test can
# assign inconsistent codes between the two frames — TODO confirm the columns'
# value sets match.
x = x.apply(LabelEncoder().fit_transform)
test = test.apply(LabelEncoder().fit_transform)

# Hold out 10% for validation, fixed seed for reproducibility.
train_x, val_x, train_y, val_y = train_test_split(x, y, test_size=0.1, random_state=777)
def _feature_groups(frame):
    """Split one feature frame into (category, ordinal, simple, complex) views.

    Mirrors the same slicing for every split (train/val/full/test):
    - category: frame minus the four ordinal-ish columns,
    - ordinal:  positional columns 7, 8, 13, 28 of the original frame,
    - simple:   category minus the complex columns,
    - complex:  the complex columns of category,
    with the f/g preference columns dropped from category and simple afterwards.
    """
    category = frame.drop(
        ['person_attribute_a_1', 'person_attribute_b',
         'person_prefer_e', 'contents_attribute_e'], axis=1)
    ordinal = frame.iloc[:, [7, 8, 13, 28]]
    simple = category.drop(category.columns[complex_lst], axis=1)
    complex_part = category.iloc[:, complex_lst]
    # f/g columns are removed only from the category and simple groups.
    category = category.drop(['person_prefer_f', 'person_prefer_g'], axis=1)
    simple = simple.drop(['person_prefer_f', 'person_prefer_g'], axis=1)
    return category, ordinal, simple, complex_part

train_category, train_ordinal, train_sim, train_com = _feature_groups(train_x)
val_category, val_ordinal, val_sim, val_com = _feature_groups(val_x)
x_category, x_ordinal, x_sim, x_com = _feature_groups(x)
test_category, test_ordinal, test_sim, test_com = _feature_groups(test)
#train_category=pd.get_dummies(train_category,columns=['contents_attribute_j_1'])
#val_category=pd.get_dummies(val_category,columns=['contents_attribute_j_1'])
#x_category=pd.get_dummies(x_category,columns=['contents_attribute_j_1'])
#test_category=pd.get_dummies(test_category,columns=['contents_attribute_j_1'])
#train_sim = pd.get_dummies(train_sim,columns=train_sim.columns)
#val_sim = pd.get_dummies(val_sim,columns=val_sim.columns)
#x_sim = pd.get_dummies(x_sim,columns=x_sim.columns)
#test_sim = pd.get_dummies(test_sim,columns=test_sim.columns)
#print(train_sim.shape)
#print(val_sim.shape)
#print(x_sim.shape)
#print(test_sim.shape)
#mdl_ordinal = XGBRegressor(boosting='dart', colsample_bylevel=0.7, colsample_bytree=0.8,
# learning_rate=0.03, max_depth=10, metric=['binary'],
# objective='reg:linear',
# n_estimators=400, n_jobs=-1, num_iterations=10000, verbose=True)
#history_ordinal = mdl_ordinal.fit(train_ordinal,train_y,
# eval_set=[(train_ordinal,train_y),(val_ordinal,val_y)],
# eval_metric='rmse', early_stopping_rounds=30,verbose=5)
print(mdl_ordinal)

# Raw regression scores on the validation split.
v_pred_ordinal = mdl_ordinal.predict(val_ordinal)
print(v_pred_ordinal)

# Binarise the scores in place at a fixed cutoff.
threshold = 0.5
for idx, score in enumerate(v_pred_ordinal):
    v_pred_ordinal[idx] = 0 if score < threshold else 1
print(v_pred_ordinal)

# Score the binarised predictions against the validation labels.
f1_ordinal = f1_score(val_y, v_pred_ordinal)
print('Threshold: ', threshold)
print('Ordinal [val] f1-score :', f1_ordinal)
Threshold: 0.35
Ordinal [val] f1-score : 0.6677433132240671
Threshold: 0.4
Ordinal [val] f1-score : 0.6670015204170286
Threshold: 0.45
Ordinal [val] f1-score : 0.6589033352176372
Threshold: 0.5
Ordinal [val] f1-score : 0.5661227834897928
Threshold: 0.55
Ordinal [val] f1-score : 0.1482149672136835
Threshold: 0.6
Ordinal [val] f1-score : 0.0547424444022049
Threshold: 0.3
Ordinal [val] f1-score : 0.690094306175991
Threshold: 0.35
Ordinal [val] f1-score : 0.6927052742863855
Threshold: 0.4
Ordinal [val] f1-score : 0.691006178013722
Threshold: 0.45
Ordinal [val] f1-score : 0.6775345234085551
Threshold: 0.5
Ordinal [val] f1-score : 0.6435973073141215
Threshold: 0.55
Ordinal [val] f1-score : 0.5760907811400423
Threshold: 0.6
Ordinal [val] f1-score : 0.46003417006176894
-> Lower threshold values give a better f1 score.
# --- Compare the two single-view models on the validation split ---

print(mdl_category)
#acc_category = mdl_category.score(x_category,y)
v_pred_category = mdl_category.predict(val_category)
f1_category = f1_score(val_y, v_pred_category)
#print('Category [train+val] :', acc_category)
print('Category [val] f1-score :', f1_category)
print()

print(mdl_ordinal)
#acc_ordinal = mdl_ordinal.score(x_ordinal,y)
# Best cutoff found in the threshold sweep above.
threshold = 0.35
# BUG FIX: the original called predict twice in a row (before and after
# setting the threshold); one call is enough.
v_pred_ordinal = mdl_ordinal.predict(val_ordinal)
for i, v in enumerate(v_pred_ordinal):
    v_pred_ordinal[i] = 0 if v < threshold else 1
f1_ordinal = f1_score(val_y, v_pred_ordinal)
#print('Ordinal [train+val] :', acc_ordinal)
print('Ordinal [val] f1-score :', f1_ordinal)
XGBClassifier(boosting='dart', colsample_bylevel=0.7, colsample_bytree=0.8,
learning_rate=0.03, max_depth=10, metric=['binary'],
n_estimators=400, n_jobs=-1, num_iterations=10000, verbose=True)
Category [val] f1-score : 0.6431776444082622
XGBRegressor(boosting='dart', colsample_bylevel=0.7, colsample_bytree=0.8,
learning_rate=0.03, max_depth=10, metric=['binary'],
n_estimators=400, n_jobs=-1, num_iterations=10000, verbose=True)
Ordinal [val] f1-score : 0.6677433132240671
from sklearn.linear_model import LogisticRegression

# Level-2 meta features: each base model's prediction becomes one column.
pred_category = mdl_category.predict(x_category)
pred_ordinal = mdl_ordinal.predict(x_ordinal)
x_pred = np.transpose(np.array([pred_category, pred_ordinal]))

lm = LogisticRegression()
lm.fit(x_pred, y)

# BUG FIX: lm was fitted on the 2-column meta features, so it must also
# predict on meta features built the same way from the test split — not on
# the raw `test` frame (wrong width / wrong feature space).
test_pred = np.transpose(np.array([mdl_category.predict(test_category),
                                   mdl_ordinal.predict(test_ordinal)]))
preds = lm.predict(test_pred)
print(preds)

submission = pd.read_csv('sample_submission.csv')
submission['target'] = preds
NAME = 'second try'
submission.to_csv('{}_splitted.csv'.format(NAME), index=False)
print('saved as {}_splitted.csv'.format(NAME))
####public : 0.6365825051
####
def threshold(threshold, mdl, val_x, val_y):
    """Binarise `mdl`'s scores on val_x at `threshold` and report the f1 score.

    Parameters
    ----------
    threshold : float cutoff — scores below it become 0, otherwise 1.
    mdl : fitted model exposing .predict().
    val_x, val_y : validation features and true labels.

    Returns
    -------
    (f1, pred) : f1 score against val_y, and the binarised predictions.
    """
    #print(mdl)
    pred = mdl.predict(val_x)
    print(pred)
    # Binarise in place.
    for i, v in enumerate(pred):
        pred[i] = 0 if v < threshold else 1
    f1 = f1_score(val_y, pred)
    # BUG FIX: the label used to read 'Treshold' — a typo; every other report
    # in this script prints 'Threshold'.
    print('Threshold: ', threshold)
    print('[val] f1-score :', f1)
    return f1, pred
# Train a stacked "whole" regressor on the combined base-model predictions.
# NOTE(review): train_whole / val_whole are not defined anywhere in this
# excerpt — presumably frames of the base models' predictions built in an
# earlier cell (cf. test_whole below). Confirm before running.
mdl_whole = XGBRegressor(boosting='dart', colsample_bylevel=0.7, colsample_bytree=0.8,
    learning_rate=0.03, max_depth=10, metric=['binary'],
    objective='reg:linear',
    n_estimators=400, n_jobs=-1, num_iterations=10000, verbose=True)
# Early stopping on validation RMSE; progress logged every 5 rounds.
history_ordinal = mdl_whole.fit(train_whole,train_y,
    eval_set=[(train_whole,train_y),(val_whole,val_y)],
    eval_metric='rmse', early_stopping_rounds=30,verbose=5)
# Sweep cutoffs to pick the best binarisation threshold on the validation set.
for t in [0.35,0.4,0.45,0.5,0.55,0.6]:
    threshold(t,mdl_whole,val_whole,val_y)
Stopping. Best iteration:
[59] validation_0-rmse:0.43533 validation_1-rmse:0.474642
Threshold: 0.35
Ordinal [val] f1-score : 0.6910436896962939
Threshold: 0.4
Ordinal [val] f1-score : 0.6836704586434121
Threshold: 0.45
Ordinal [val] f1-score : 0.6700268699179852
Threshold: 0.5
Ordinal [val] f1-score : 0.6383650827282008
Threshold: 0.55
Ordinal [val] f1-score : 0.5906855903635153
Threshold: 0.6
Ordinal [val] f1-score : 0.5115856205617121
##submission file
from datetime import datetime
import datetime as dt
import pytz

# Timestamp (KST) used to name the submission file.
KST = pytz.timezone('Asia/Seoul')
NAME = datetime.now(KST).strftime('%m-%d %H-%M')

# Build the stacked test features from the two base models' predictions.
category = mdl_category.predict(test_category)
ordinal = mdl_ordinal.predict(test_ordinal)
test_whole = pd.DataFrame({'categorical':category,'ordinal':ordinal})

# Score with the stacked model and binarise at the cutoff found above.
pred = mdl_whole.predict(test_whole)
threshold = 0.35
for i, v in enumerate(pred):
    pred[i] = 0 if v < threshold else 1

# BUG FIX: the original computed `pred` but never stored it, so the file was
# written with whatever 'target' column `submission` already held (cf. the
# earlier cell, which assigns submission['target'] before saving).
submission['target'] = pred
submission.to_csv('{}_splitted.csv'.format(NAME), index=False)
print('saved as {}_splitted.csv'.format(NAME))
## Simple average of the two base models' validation scores
category = mdl_category.predict(val_category)
ordinal = mdl_ordinal.predict(val_ordinal)
whole = (category + ordinal) / 2
#val_whole=pd.DataFrame({'whole':whole})

# Loop var renamed from `threshold` so it no longer shadows the threshold()
# helper defined above.
for cutoff in [0.35, 0.4, 0.45, 0.5, 0.55, 0.6]:
    # Re-average each pass because the inner loop binarises `whole` in place.
    whole = (category + ordinal) / 2
    for i, v in enumerate(whole):
        whole[i] = 0 if v < cutoff else 1
    # BUG FIX: the original scored f1_score(whole, pred) — `pred` there is the
    # *test*-set prediction left over from the previous cell, so the report
    # compared unrelated arrays. Score the binarised average against the
    # validation labels, as every other evaluation cell does.
    f1 = f1_score(val_y, whole)
    print('Threshold: ', cutoff)
    print('[val] f1-score :', f1)
Threshold: 0.35
[val] f1-score : 0.6821884314775897
Threshold: 0.4
[val] f1-score : 0.6909980542520315
Threshold: 0.45
[val] f1-score : 0.6896929824561403
Threshold: 0.5
[val] f1-score : 0.6451160587399336
Threshold: 0.55
[val] f1-score : 0.4763477767168185
Threshold: 0.6
[val] f1-score : 0.18897855980049183
##threshold : 0.5
##public : 0.6431026019
'Project > DACON' 카테고리의 다른 글
[DACON] 잡케어 추천 알고리즘 Part2 (0) | 2022.01.27 |
---|