본문 바로가기

Project/DACON

[DACON] 잡케어 추천 알고리즘 Part3

5. Experiment1 :


Categorical / Ordinal Value

Simple / Complex Value

로 나눈 후 예측값들로 새로운 DataSet 

 


import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Column positions (within the "category" feature frame) of the complex
# categorical features that get their own sub-model.
complex_lst = [8, 9, 10, 13, 14, 15, 22, 23, 25]

x = train.iloc[:, :-1]
y = train.iloc[:, -1]

# BUG FIX: DataFrame.apply returns a *new* frame — the original code discarded
# the result, so label encoding silently had no effect on x and test.
# NOTE(review): fitting a fresh LabelEncoder per column and per frame does not
# guarantee consistent codes between train and test — confirm the category
# values match across the two frames.
x = x.apply(LabelEncoder().fit_transform)
test = test.apply(LabelEncoder().fit_transform)

train_x, val_x, train_y, val_y = train_test_split(x, y, test_size=0.1, random_state=777)

# Positional columns treated as ordinal rather than categorical.
_ORDINAL_COLS = [7, 8, 13, 28]
# Named columns excluded from the "category" frame.
_DROP_CATEGORY = ['person_attribute_a_1', 'person_attribute_b',
                  'person_prefer_e', 'contents_attribute_e']
# Dropped from the category and simple frames only (kept in the complex frame).
_DROP_PREFER = ['person_prefer_f', 'person_prefer_g']


def _split_features(df):
    """Split one feature frame into (category, ordinal, simple, complex) views.

    Mirrors the original per-frame code, which was duplicated four times:
    "category" drops four named columns, "ordinal" takes fixed column
    positions from the raw frame, and simple/complex partition the category
    columns by position using `complex_lst`.
    """
    category = df.drop(_DROP_CATEGORY, axis=1)
    ordinal = df.iloc[:, _ORDINAL_COLS]
    # simple/complex are computed *before* person_prefer_f/g are dropped, so
    # the positions in complex_lst line up with the original code.
    simple = category.drop(category.columns[complex_lst], axis=1)
    complex_ = category.iloc[:, complex_lst]
    return (category.drop(_DROP_PREFER, axis=1), ordinal,
            simple.drop(_DROP_PREFER, axis=1), complex_)


train_category, train_ordinal, train_sim, train_com = _split_features(train_x)
val_category, val_ordinal, val_sim, val_com = _split_features(val_x)
x_category, x_ordinal, x_sim, x_com = _split_features(x)
test_category, test_ordinal, test_sim, test_com = _split_features(test)

#train_category=pd.get_dummies(train_category,columns=['contents_attribute_j_1'])
#val_category=pd.get_dummies(val_category,columns=['contents_attribute_j_1'])
#x_category=pd.get_dummies(x_category,columns=['contents_attribute_j_1'])
#test_category=pd.get_dummies(test_category,columns=['contents_attribute_j_1'])


#train_sim = pd.get_dummies(train_sim,columns=train_sim.columns)
#val_sim = pd.get_dummies(val_sim,columns=val_sim.columns)
#x_sim = pd.get_dummies(x_sim,columns=x_sim.columns)
#test_sim = pd.get_dummies(test_sim,columns=test_sim.columns)

#print(train_sim.shape)
#print(val_sim.shape)
#print(x_sim.shape)
#print(test_sim.shape)

 

 

 

 

#mdl_ordinal = XGBRegressor(boosting='dart', colsample_bylevel=0.7, colsample_bytree=0.8,
#              learning_rate=0.03, max_depth=10, metric=['binary'],
#              objective='reg:linear',
#              n_estimators=400, n_jobs=-1, num_iterations=10000, verbose=True)

#history_ordinal = mdl_ordinal.fit(train_ordinal,train_y,
#                                  eval_set=[(train_ordinal,train_y),(val_ordinal,val_y)],
#                                  eval_metric='rmse', early_stopping_rounds=30,verbose=5)

# Evaluate the ordinal-feature regressor on the validation split by
# binarising its continuous predictions at a fixed cutoff.
print(mdl_ordinal)
v_pred_ordinal = mdl_ordinal.predict(val_ordinal)
print(v_pred_ordinal)

threshold = 0.5
# Binarise in place: scores below the cutoff become 0, the rest become 1.
for idx in range(len(v_pred_ordinal)):
    v_pred_ordinal[idx] = 0 if v_pred_ordinal[idx] < threshold else 1

print(v_pred_ordinal)


f1_ordinal = f1_score(val_y, v_pred_ordinal)
print('Threshold: ', threshold)
print('Ordinal [val] f1-score :', f1_ordinal)
Threshold:  0.35
Ordinal [val] f1-score : 0.6677433132240671

Threshold:  0.4
Ordinal [val] f1-score : 0.6670015204170286

Threshold:  0.45
Ordinal [val] f1-score : 0.6589033352176372

Threshold:  0.5
Ordinal [val] f1-score : 0.5661227834897928

Threshold:  0.55
Ordinal [val] f1-score : 0.1482149672136835

Threshold:  0.6
Ordinal [val] f1-score : 0.0547424444022049
Threshold:  0.3
Ordinal [val] f1-score : 0.690094306175991

Threshold:  0.35
Ordinal [val] f1-score : 0.6927052742863855

Threshold:  0.4
Ordinal [val] f1-score : 0.691006178013722

Threshold:  0.45
Ordinal [val] f1-score : 0.6775345234085551

Threshold:  0.5
Ordinal [val] f1-score : 0.6435973073141215

Threshold:  0.55
Ordinal [val] f1-score : 0.5760907811400423

Threshold:  0.6
Ordinal [val] f1-score : 0.46003417006176894

 

-> Threshold값 낮을 때 f1 score 이득

 

# Validation scores of the two base models before stacking them.
print(mdl_category)
#acc_category = mdl_category.score(x_category,y)
v_pred_category = mdl_category.predict(val_category)
f1_category = f1_score(val_y, v_pred_category)
#print('Category [train+val] :', acc_category)
print('Category [val] f1-score :', f1_category)

print()
print(mdl_ordinal)
#acc_ordinal = mdl_ordinal.score(x_ordinal,y)

# FIX: the regressor's validation predictions were computed twice in a row;
# predict once and binarise at the cutoff chosen from the threshold sweep.
threshold = 0.35
v_pred_ordinal = mdl_ordinal.predict(val_ordinal)
for i, v in enumerate(v_pred_ordinal):
    v_pred_ordinal[i] = 0 if v < threshold else 1
f1_ordinal = f1_score(val_y, v_pred_ordinal)
#print('Ordinal [train+val] :', acc_ordinal)
print('Ordinal [val] f1-score :', f1_ordinal)


XGBClassifier(boosting='dart', colsample_bylevel=0.7, colsample_bytree=0.8,
              learning_rate=0.03, max_depth=10, metric=['binary'],
              n_estimators=400, n_jobs=-1, num_iterations=10000, verbose=True)
Category [val] f1-score : 0.6431776444082622

XGBRegressor(boosting='dart', colsample_bylevel=0.7, colsample_bytree=0.8,
             learning_rate=0.03, max_depth=10, metric=['binary'],
             n_estimators=400, n_jobs=-1, num_iterations=10000, verbose=True)
Ordinal [val] f1-score : 0.6677433132240671
from sklearn.linear_model import LogisticRegression

# Stacking: the two base models' predictions on the full training set become
# the 2-column input of a logistic-regression meta-model.
pred_category = mdl_category.predict(x_category)
pred_ordinal = mdl_ordinal.predict(x_ordinal)

x_pred = np.transpose(np.array([pred_category, pred_ordinal]))
lm = LogisticRegression()
lm.fit(x_pred, y)

# BUG FIX: the meta-model was fit on 2 stacked prediction columns but was
# asked to predict on the raw `test` frame. Build the same 2-column
# representation for the test set before predicting.
test_pred = np.transpose(np.array([mdl_category.predict(test_category),
                                   mdl_ordinal.predict(test_ordinal)]))
preds = lm.predict(test_pred)

print(preds)

submission = pd.read_csv('sample_submission.csv')
submission['target'] = preds

NAME = 'second try'

submission.to_csv('{}_splitted.csv'.format(NAME), index=False)
print('saved as {}_splitted.csv'.format(NAME))

####public : 0.6365825051

 

 

 

####

def threshold(threshold, mdl, val_x, val_y):
    """Binarise `mdl`'s continuous predictions at `threshold` and report F1.

    Parameters
    ----------
    threshold : float cutoff; predictions below it map to 0, the rest to 1.
    mdl : fitted model exposing ``.predict``.
    val_x, val_y : validation features and binary labels.

    Returns
    -------
    (f1, pred) : the F1 score and the binarised prediction array.
    """
    #print(mdl)
    pred = mdl.predict(val_x)
    print(pred)
    # Binarise in place (pred is mutated and returned).
    for i, v in enumerate(pred):
        pred[i] = 0 if v < threshold else 1
    f1 = f1_score(val_y, pred)
    # FIX: corrected the 'Treshold' typo in the report line.
    print('Threshold: ', threshold)
    print('[val] f1-score :', f1)
    return f1, pred
# Second-stage model: one XGBoost regressor over the "whole" feature frame.
# NOTE(review): `train_whole` / `val_whole` are built elsewhere (presumably
# the stacked base-model predictions) — confirm they exist before this runs.
mdl_whole = XGBRegressor(boosting='dart', colsample_bylevel=0.7, colsample_bytree=0.8,
              learning_rate=0.03, max_depth=10, metric=['binary'],
              objective='reg:linear',
              n_estimators=400, n_jobs=-1, num_iterations=10000, verbose=True)

# Early stopping on validation RMSE, logging every 5 rounds.
history_ordinal = mdl_whole.fit(train_whole,train_y,
                                  eval_set=[(train_whole,train_y),(val_whole,val_y)],
                                  eval_metric='rmse', early_stopping_rounds=30,verbose=5)

# Sweep binarisation cutoffs with the threshold() helper defined above.
for t in [0.35,0.4,0.45,0.5,0.55,0.6]:
    threshold(t,mdl_whole,val_whole,val_y)
    

Stopping. Best iteration:
[59]	validation_0-rmse:0.43533	validation_1-rmse:0.474642

Threshold:  0.35
Ordinal [val] f1-score : 0.6910436896962939
Threshold:  0.4
Ordinal [val] f1-score : 0.6836704586434121
Threshold:  0.45
Ordinal [val] f1-score : 0.6700268699179852
Threshold:  0.5
Ordinal [val] f1-score : 0.6383650827282008
Threshold:  0.55
Ordinal [val] f1-score : 0.5906855903635153
Threshold:  0.6
Ordinal [val] f1-score : 0.5115856205617121
##submission file

from datetime import datetime
import datetime as dt
import pytz
# Timestamped filename in Korean local time.
KST = pytz.timezone('Asia/Seoul')
NAME = datetime.now(KST).strftime('%m-%d %H-%M')

# Build the stacked test-set frame from the two base models' predictions.
category = mdl_category.predict(test_category)
ordinal = mdl_ordinal.predict(test_ordinal)
test_whole = pd.DataFrame({'categorical': category, 'ordinal': ordinal})


pred = mdl_whole.predict(test_whole)

# Binarise at the cutoff chosen on validation.
# NOTE: local renamed from `threshold` so it no longer shadows (and destroys)
# the threshold() helper function defined earlier in this file.
cutoff = 0.35
for i, v in enumerate(pred):
    pred[i] = 0 if v < cutoff else 1

# BUG FIX: the binarised predictions were never written into the submission
# frame, so the saved file still held the previous cell's stale predictions.
submission['target'] = pred
submission.to_csv('{}_splitted.csv'.format(NAME), index=False)
print('saved as {}_splitted.csv'.format(NAME))
## Simple-average ensemble of the two base models on the validation split.
category = mdl_category.predict(val_category)
ordinal = mdl_ordinal.predict(val_ordinal)
#val_whole=pd.DataFrame({'whole':whole})

# NOTE: loop variable renamed from `threshold` to avoid shadowing the
# threshold() helper defined earlier.
for cutoff in [0.35, 0.4, 0.45, 0.5, 0.55, 0.6]:
    # Recompute each pass because the array is binarised in place below.
    whole = (category + ordinal) / 2
    for i, v in enumerate(whole):
        whole[i] = 0 if v < cutoff else 1

    # BUG FIX: the original scored f1_score(whole, pred) — comparing the
    # averaged validation predictions against the *test-set* predictions left
    # over from the previous cell. Score against the validation labels.
    f1 = f1_score(val_y, whole)
    print('Threshold: ', cutoff)
    print('[val] f1-score :', f1)
    
Threshold:  0.35
[val] f1-score : 0.6821884314775897
Threshold:  0.4
[val] f1-score : 0.6909980542520315
Threshold:  0.45
[val] f1-score : 0.6896929824561403
Threshold:  0.5
[val] f1-score : 0.6451160587399336
Threshold:  0.55
[val] f1-score : 0.4763477767168185
Threshold:  0.6
[val] f1-score : 0.18897855980049183

##threshold : 0.5
##public : 0.6431026019

 

 

'Project > DACON' 카테고리의 다른 글

[DACON] 잡케어 추천 알고리즘 Part2  (0) 2022.01.27