Expedia Data Mining (Kaggle Competition)


Author: SJTU_JORY | Published 2018-09-08 12:58

    1. Two methods were used: SGD logistic regression (SGDLR) and Random Forest.
    2. Both were combined with this competition's well-known data leakage.
    3. Final score: 49.999, good for 104th place on the Kaggle leaderboard (out of 1,700+ teams).
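
    The competition metric is MAP@5: each row may list up to five predicted clusters, and a hit at rank k earns 1/k, averaged over rows. A minimal sketch of the computation, with toy numbers (not competition data):

    # MAP@5 with one relevant item per row: score = 1/rank of the first hit, else 0
    def map5(predicted, actual):
        total = 0.0
        for top5, truth in zip(predicted, actual):
            for rank, p in enumerate(top5[:5], start=1):
                if p == truth:
                    total += 1.0 / rank
                    break
        return total / len(actual)

    # truth 42 is found at rank 2 and scores 0.5; the second row is a miss
    print(map5([[7, 42, 1, 3, 9], [5, 6, 8, 0, 2]], [42, 99]))  # (0.5 + 0)/2 = 0.25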
    1. Leakage solution

    The leak: train and test rows that share the same (user_location_city, orig_destination_distance) pair almost always refer to the same hotel, so clusters observed for that pair in train can be replayed on test. The script below builds several weighted lookup tables from train.csv and writes up to five clusters per test row.

    # -*- coding: utf-8 -*-
    from heapq import nlargest
    from operator import itemgetter
    
    def leakage_deal():
        f=open("train.csv", "r")
        f.readline()    
        best_hotels_odd_ulc={}
        best_hotels_miss_odd={}
        best_h00={}
        best_h01={}
        count=0
        while True:
            line=f.readline().strip()
            count+=1
            if line == '':
                break
            arr=line.split(",")
            book_year=int(arr[0][:4])
            book_month=int(arr[0][5:7])
            user_location_city=arr[5]
            orig_destination_distance=arr[6]
            user_id=arr[7]
            srch_destination_id=arr[16]
            hotel_country=arr[21]
            hotel_market=arr[22]
            is_booking=float(arr[18])
            hotel_cluster=arr[23]
    
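            # recency weight: squared months since Dec 2012, so recent events dominate;
            # a booking (3 + 17.60) outweighs a click (3) by roughly 7x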
            relative_ref_month=((book_year-2012)*12+(book_month-12))
            append_weight=relative_ref_month*relative_ref_month*(3+17.60*is_booking)
    
            if user_location_city!='' and orig_destination_distance!='' and user_id!='' and srch_destination_id!= '' and hotel_country!= '':
                s00=hash(str(user_id)+':'+str(user_location_city)+':'+str(srch_destination_id)+':'+str(hotel_country)+':'+str(hotel_market))
                if s00 in best_h00:
                    if hotel_cluster in best_h00[s00]:
                        best_h00[s00][hotel_cluster] += append_weight
                    else:
                        best_h00[s00][hotel_cluster] = append_weight
                else:
                    best_h00[s00] = {}
                    best_h00[s00][hotel_cluster] = append_weight
    
            if user_location_city!='' and orig_destination_distance!='' and user_id!='' and srch_destination_id!= '':
                s01=hash(str(user_id)+':'+str(srch_destination_id)+':'+str(hotel_country)+':'+str(hotel_market))
                if s01 in best_h01:
                    if hotel_cluster in best_h01[s01]:
                        best_h01[s01][hotel_cluster]+=append_weight
                    else:
                        best_h01[s01][hotel_cluster]=append_weight
                else:
                    best_h01[s01]={}
                    best_h01[s01][hotel_cluster]=append_weight
    
    
            if user_location_city!= '' and orig_destination_distance=='' and user_id!='' and srch_destination_id!='' and hotel_country!='':
                s0 = hash(str(user_id)+':'+str(user_location_city)+':'+str(srch_destination_id)+':'+str(hotel_country)+':'+str(hotel_market))
                if s0 in best_hotels_miss_odd:
                    if hotel_cluster in best_hotels_miss_odd[s0]:
                        best_hotels_miss_odd[s0][hotel_cluster]+=append_weight
                    else:
                        best_hotels_miss_odd[s0][hotel_cluster]=append_weight
                else:
                    best_hotels_miss_odd[s0]={}
                    best_hotels_miss_odd[s0][hotel_cluster]=append_weight
    
            if user_location_city!='' and orig_destination_distance!='':
                s1 = hash(str(user_location_city)+':'+str(orig_destination_distance))
    
                if s1 in best_hotels_odd_ulc:
                    if hotel_cluster in best_hotels_odd_ulc[s1]:
                        best_hotels_odd_ulc[s1][hotel_cluster]+=relative_ref_month
                    else:
                        best_hotels_odd_ulc[s1][hotel_cluster]=relative_ref_month
                else:
                    best_hotels_odd_ulc[s1]={}
                    best_hotels_odd_ulc[s1][hotel_cluster]=relative_ref_month
    
        f.close()
        return best_h00,best_h01, best_hotels_odd_ulc, best_hotels_miss_odd
    
    def submit(best_h00, best_h01, best_hotels_odd_ulc, best_hotels_miss_odd):
        path='leakage_deal.csv'
        out=open(path, "w")
        f=open("test.csv", "r")
        f.readline()
        count=0
        count0=0
        count00=0
        count1=0
        out.write("id,hotel_cluster\n")
        while True:
            line=f.readline().strip()
            count+=1
            if count % 100000 == 0:
                print('Write {} lines...'.format(count))
            if line == '':
                break
            arr=line.split(",")
            id=arr[0]
            user_location_city=arr[6]
            orig_destination_distance=arr[7]
            user_id=arr[8]
            srch_destination_id=arr[17]
            hotel_country=arr[20]
            hotel_market=arr[21]
            out.write(str(id) + ',')
            filled=[]
            s1=hash(str(user_location_city)+':'+str(orig_destination_distance))
            if s1 in best_hotels_odd_ulc:
                d=best_hotels_odd_ulc[s1]
            topitems=nlargest(5, sorted(d.items()), key=itemgetter(1))  # sorted() makes tie-breaks deterministic
                for i in range(len(topitems)):
                    if topitems[i][0] in filled:
                        continue
                    if len(filled) == 5:
                        break
                    out.write(' ' + topitems[i][0])
                    filled.append(topitems[i][0])
                    count1 += 1
            if orig_destination_distance == '':
                s0=hash(str(user_id)+':'+str(user_location_city)+':'+str(srch_destination_id)+':'+str(hotel_country)+':'+str(hotel_market))
                if s0 in best_hotels_miss_odd:
                    d=best_hotels_miss_odd[s0]
                    topitems=nlargest(4, sorted(d.items()), key=itemgetter(1))
                    for i in range(len(topitems)):
                        if topitems[i][0] in filled:
                            continue
                        if len(filled) == 5:
                            break
                        out.write(' ' + topitems[i][0])
                        filled.append(topitems[i][0])
                        count0+=1
            s00=hash(str(user_id)+':'+str(user_location_city)+':'+str(srch_destination_id)+':'+str(hotel_country)+':'+str(hotel_market))
            s01=hash(str(user_id)+':'+str(srch_destination_id)+':'+str(hotel_country)+':'+str(hotel_market))
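            # fall back to the coarser key (without user_location_city) only when
            # the exact s00 key was never seen in train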
            if s01 in best_h01 and s00 not in best_h00:
                d=best_h01[s01]
                topitems=nlargest(4, sorted(d.items()), key=itemgetter(1))
                for i in range(len(topitems)):
                    if topitems[i][0] in filled:
                        continue
                    if len(filled) == 5:
                        break
                    out.write(' ' + topitems[i][0])
                    filled.append(topitems[i][0])
                    count00 += 1
            out.write("\n")
        out.close()
        print('count 1=',count1)
        print('count 0=',count0)
        print('count 00=',count00)
    
    best_h00,best_h01, best_hotels_odd_ulc, best_hotels_miss_odd = leakage_deal()
    submit(best_h00, best_h01, best_hotels_odd_ulc, best_hotels_miss_odd)
    

    2. Random Forest

    A RandomForestClassifier is grown incrementally over 200,000-row chunks of train.csv (booking rows only), after merging in the destination latent features and a per-destination cluster-popularity table.
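
    A minimal sketch of the warm_start pattern used below (toy data, illustrative names only):

    from sklearn.ensemble import RandomForestClassifier
    import numpy as np

    rng = np.random.RandomState(0)
    clf = RandomForestClassifier(n_estimators=1, warm_start=True)
    for _ in range(3):                  # stand-ins for the real data chunks
        Xc, yc = rng.rand(50, 4), rng.randint(0, 2, 50)
        clf.set_params(n_estimators=clf.n_estimators + 1)
        clf.fit(Xc, yc)                 # keeps the old trees, trains only the new one
    print(len(clf.estimators_))         # 4 trees, each having seen a single chunk

    Each tree only ever sees one chunk, and the class set must stay stable across fits, which is why the script below only fits on chunks containing all 100 clusters.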

    # -*- coding: utf-8 -*-
    import os
    import pandas as pd
    import numpy as np
    import h5py
    from sklearn.ensemble import RandomForestClassifier
    
    def pre_deal(data):
        '''data_pre_deal'''
        try:
            # repair impossible dates such as '2161-10-00' that were left as strings;
            # if read_csv already parsed these columns, .str raises and we skip this
            data.loc[data.srch_ci.str.endswith('00'),'srch_ci'] = '2015-12-31'
            data['srch_ci'] = data.srch_ci.astype(np.datetime64)
            data.loc[data.date_time.str.endswith('00'),'date_time'] = '2015-12-31'
            data['date_time'] = data.date_time.astype(np.datetime64)
        except:
            pass
        data.fillna(0, inplace=True)
        #calculate the duration in hotel
        data['live_in_days'] = data.srch_co-data.srch_ci
        data['live_in_days'] = data['live_in_days'].apply(lambda ts: ts/np.timedelta64(1,'D'))
        #calculate the time from book to live in the hotel
        data['date_to_live_days'] = data.srch_ci-data.date_time
        data['date_to_live_days'] = data['date_to_live_days'].apply(lambda ts: ts/np.timedelta64(1,'D'))
        data['ci_month'] = data['srch_ci'].apply(lambda dt: dt.month)
        data['ci_day'] = data['srch_ci'].apply(lambda dt: dt.day)
        data['date_month'] = data['date_time'].apply(lambda dt: dt.month)
        data['date_day'] = data['date_time'].apply(lambda dt: dt.day)
        data['date_hour'] = data['date_time'].apply(lambda dt: dt.hour)
        data.drop(['date_time', 'user_id', 'srch_ci', 'srch_co'], axis=1, inplace=True)
    # build (or load) a per-destination cluster-popularity table: within each
    # (srch_destination_id, hotel_country, hotel_market) group, every cluster is
    # scored 0.85*bookings + 0.15*clicks, then normalized to sum to 1
    if os.path.exists('srch_dest_hc_hm_agg.csv'):
        agg1 = pd.read_csv('srch_dest_hc_hm_agg.csv')
    else:
        reader = pd.read_csv('train.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=200000)
        pieces = [chunk.groupby(['srch_destination_id','hotel_country','hotel_market','hotel_cluster'])['is_booking'].agg(['sum','count']) for chunk in reader]
        agg = pd.concat(pieces).groupby(level=[0,1,2,3]).sum()
        agg.dropna(inplace=True)
        agg['sum_and_cnt'] = 0.85*agg['sum'] + 0.15*agg['count']
        agg = agg.groupby(level=[0,1,2]).apply(lambda x: x.astype(float)/x.sum())
        agg.reset_index(inplace=True)
        agg1 = agg.pivot_table(index=['srch_destination_id','hotel_country','hotel_market'], columns='hotel_cluster', values='sum_and_cnt').reset_index()
        agg1.to_csv('srch_dest_hc_hm_agg.csv', index=False)
        #clean memory
        del pieces,agg
    
    destinations = pd.read_csv('destinations.csv')
    submission = pd.read_csv('sample_submission.csv')
    
    # warm_start grows the forest chunk by chunk: the first qualifying chunk trains
    # the initial ~100 trees; every later chunk adds a single tree
    clf=RandomForestClassifier(n_estimators=100, n_jobs=-1, warm_start=True)
    count=0
    chunksize=200000
    reader = pd.read_csv('train.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=chunksize)
    for chunk in reader:
        try:
            chunk = chunk[chunk.is_booking==1]
            chunk = pd.merge(chunk, destinations, how='left', on='srch_destination_id')
            chunk = pd.merge(chunk, agg1, how='left', on=['srch_destination_id','hotel_country','hotel_market'])
            pre_deal(chunk)
            y = chunk.hotel_cluster
            chunk.drop(['cnt', 'hotel_cluster', 'is_booking'], axis=1, inplace=True)
            if len(y.unique()) == 100:   # only fit when every cluster is present
                clf.set_params(n_estimators=clf.n_estimators+1)
                clf.fit(chunk, y)
            count = count + chunksize
            print(count,' have done')
            if(count/chunksize == 300):
                break
        except Exception as e:
            print(str(e))
            pass
    
    count = 0
    chunksize = 10000
    preds = np.empty((submission.shape[0],clf.n_classes_))
    reader = pd.read_csv('test.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=chunksize)
    for chunk in reader:
        chunk = pd.merge(chunk, destinations, how='left', on='srch_destination_id')
        chunk = pd.merge(chunk, agg1, how='left', on=['srch_destination_id','hotel_country','hotel_market'])
        chunk.drop(['id'], axis=1, inplace=True)
        pre_deal(chunk)
        pred = clf.predict_proba(chunk)
        preds[count:(count + chunk.shape[0]),:] = pred
        count = count + chunksize
        print(count,' have done')
    del clf,agg1
    
    if os.path.exists('rf.h5'):
        with h5py.File('rf.h5', 'r+') as hf:
            predslatesthf = hf['preds_latest']
            preds += predslatesthf[:]    # accumulate across runs; .value was removed in h5py >= 3.0
            predslatesthf[...] = preds
    else:
        with h5py.File('rf.h5', 'w') as hf:
            hf.create_dataset('preds_latest', data=preds)
    # column index equals cluster label here, since the classes are exactly 0..99
    fea_ind = np.argsort(-preds, axis=1)[:,:5]
    happend = [' '.join(row.astype(str)) for row in fea_ind]
    submit = pd.DataFrame(data=happend, index=submission.id)
    submit.reset_index(inplace=True)
    submit.columns = submission.columns
    submit.to_csv('rf_deal.csv', index=False)
    

    3. SGDLR

    SGDClassifier with logistic loss is trained via partial_fit over chunks, on hashed categorical features plus two interaction features; bookings receive 5x the sample weight of clicks.
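
    The categorical columns are one-hot encoded with the hashing trick: every 'column=value' token is hashed into one of n_features columns, so no global dictionary has to be kept in memory. The script builds the sparse matrix by hand; an equivalent sketch (not the author's code) using scikit-learn's FeatureHasher:

    from sklearn.feature_extraction import FeatureHasher

    # each row is a list of 'col=value' tokens; tokens hash to column indices
    hasher = FeatureHasher(n_features=2**22, input_type='string', alternate_sign=False)
    rows = [['user_id=93', 'hotel_market=628'],
            ['user_id=93', 'hotel_market=110']]
    X = hasher.transform(rows)   # scipy.sparse CSR matrix of shape (2, 2**22)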

    # -*- coding: utf-8 -*-
    import pandas as pd
    from scipy.sparse import csr_matrix, hstack
    import numpy as np
    import h5py
    import pickle
    from sklearn.linear_model import SGDClassifier
    import os
    cat_col = ['user_id','user_location_city','srch_destination_id','srch_destination_type_id',
               'hotel_continent','hotel_country', 'hotel_market']
    num_col = ['is_mobile', 'is_package']
    def bin_time(t):
        '''bucket days from booking to check-in into five coarse bins'''
        if t < 0:
            x = 0
        elif t < 2:
            x = 1
        elif t < 7:
            x = 2
        elif t < 30:
            x = 3
        else:
            x = 4
        return x
    
    def pre_process(data):
        try:
            # same date repair as in the Random Forest script; skipped when
            # read_csv has already parsed the columns
            data.loc[data.srch_ci.str.endswith('00'),'srch_ci'] = '2015-12-31'
            data['srch_ci'] = data.srch_ci.astype(np.datetime64)
            data.loc[data.date_time.str.endswith('00'),'date_time'] = '2015-12-31'
            data['date_time'] = data.date_time.astype(np.datetime64)
        except:
            pass
        data.fillna(0, inplace=True)
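        # interaction features: (check-in month x destination) and
        # (days-to-check-in bin x destination), reduced to single hashed ids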
        data['ci_month'] = data['srch_ci'].apply(lambda dt: dt.month)
        data['season_dest'] = 'season_dest' + data.ci_month.map(str) + '*' + data.srch_destination_id.map(str)
        data['season_dest'] = data['season_dest'].map(hash)
        data['date_to_live_days'] = data.srch_ci-data.date_time
        data['date_to_live_days'] = data['date_to_live_days'].apply(lambda td: td/np.timedelta64(1, 'D'))
        data['date_to_live_days'] = data['date_to_live_days'].map(bin_time)
        data['time_dest'] = 'time_dest' + data.date_to_live_days.map(str) + '*' + data.srch_destination_id.map(str)
        data['time_dest'] = data['time_dest'].map(hash)
        
        for col in cat_col:
            # hashing trick: Python randomizes str hashes per process, so the
            # pickled model is only reusable when PYTHONHASHSEED is fixed
            data[col] = col + data[col].map(str)
            data[col] = data[col].map(hash)
    
    submission = pd.read_csv('sample_submission.csv')
    cat_col_all = cat_col + ['season_dest', 'time_dest']
    def map5eval(preds, actual):
        '''MAP@5 of a probability matrix against the true clusters'''
        # the five highest-probability class columns, best first
        predicted = preds.argsort(axis=1)[:, -1:-6:-1]
        metric = 0.
        for i in range(5):
            metric += np.sum(actual==predicted[:,i])/(i+1)
        metric /= actual.shape[0]
        return metric
    
    if os.path.exists('sgd.pkl'):
        with open('sgd.pkl', 'rb') as f:
            clf = pickle.load(f)
    else:
        clf = SGDClassifier(loss='log', alpha=0.0000025, verbose=0)   # loss='log' is 'log_loss' in scikit-learn >= 1.1
    #clf.sparsify()
    chunksize = 200000
    n_features = 3000000   # width of the hashed feature space (reused at test time)
    for epoch in range(5):
        count = 0
        print('Epoch: ', epoch)
        reader = pd.read_csv('train.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=chunksize)
        for chunk in reader:
            try:
                pre_process(chunk)
                y = chunk.hotel_cluster
                sw = 1 + 4*chunk.is_booking
                chunk.drop(['cnt', 'hotel_cluster', 'is_booking'], axis=1, inplace=True)
                XN = csr_matrix(chunk[num_col].values)
                X = csr_matrix((chunk.shape[0], n_features))
                rows = np.arange(chunk.shape[0])
                for col in cat_col_all:
                    dat = np.ones(chunk.shape[0])
                    cols = chunk[col] % n_features
                    X += csr_matrix((dat, (rows, cols)), shape=(chunk.shape[0], n_features))
                X = hstack((XN, X))
                book_indices = sw[sw > 1].index.tolist()       # rows that are actual bookings
                x_indices=[(x-count) for x in book_indices]    # global index -> position within this chunk
                X_test = csr_matrix(X)[x_indices]
                y_test = y[book_indices]
                clf.partial_fit(X, y, classes=np.arange(100), sample_weight=sw)         
                count = count + chunksize
                map5 = map5eval(clf.predict_proba(X_test), y_test)   # in-sample check on this chunk's bookings
                print((count, map5),' have done')
                if(count/chunksize == 200):
                    break
            except Exception as e:
                count = count + chunksize
                print(str(e))
                pass
    
    with open('sgd.pkl', 'wb') as f:
        pickle.dump(clf, f)
    
    count = 0
    chunksize = 10000
    preds = np.empty((0,100))
    reader = pd.read_csv('test.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=chunksize)
    for chunk in reader:
        chunk.drop(['id'], axis=1, inplace=True)
        pre_process(chunk)
        XN = csr_matrix(chunk[num_col].values)
        X = csr_matrix((chunk.shape[0], n_features))
        rows = np.arange(chunk.shape[0])
        for col in cat_col_all:
            dat = np.ones(chunk.shape[0])
            cols = chunk[col] % n_features
            X += csr_matrix((dat, (rows, cols)), shape=(chunk.shape[0], n_features))
        X = hstack((XN, X))
        pred = clf.predict_proba(X)
        preds = np.vstack((preds, pred))
        count = count + chunksize
        print(count,' have done')
    del clf
    
    if os.path.exists('sgd.h5'):
        with h5py.File('sgd.h5', 'r+') as hf:
            predshf = hf['preds']
            predshf[...] = preds
    else:
        with h5py.File('sgd.h5', 'w') as hf:
            hf.create_dataset('preds', data=preds)
    
    col_ind = np.argsort(-preds, axis=1)[:,:5]
    hc = [' '.join(row.astype(str)) for row in col_ind]
    submit = pd.DataFrame(data=hc, index=submission.id)
    submit.reset_index(inplace=True)
    submit.columns = submission.columns
    submit.to_csv('sgdlr_deal.csv', index=False)
    
    

    4. Blend

    The RF and SGDLR probability matrices are L1-normalized and combined with weights 0.54 and 0.46; the top five clusters of the blend form the submission.

    # -*- coding: utf-8 -*-
    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import normalize
    import h5py
    
    submission = pd.read_csv('sample_submission.csv')
    
    # read in RF results
    with h5py.File('rf.h5', 'r') as hf:
        predshf = hf['preds_latest']
        preds = 0.54*normalize(predshf[:], norm='l1', axis=1)   # .value was removed in h5py >= 3.0

    # read in SGD results ('sgd.h5' is what the SGDLR script above writes)
    with h5py.File('sgd.h5', 'r') as hf:
        predshf = hf['preds']
        preds += 0.46*normalize(predshf[:], norm='l1', axis=1)
    
    
    col_ind = np.argsort(-preds, axis=1)[:,:5]
    hc = [' '.join(row.astype(str)) for row in col_ind]
    
    sub = pd.DataFrame(data=hc, index=submission.id)
    sub.reset_index(inplace=True)
    sub.columns = submission.columns
    sub.to_csv('blend_deal.csv', index=False)
    
    

    5. Stack

    Finally, the high-precision leakage predictions go first and the blended model predictions fill the remaining slots, deduplicated, up to five clusters per row.

    # -*- coding: utf-8 -*-
    import pandas as pd
    
    match_pred = pd.read_csv('leakage_deal.csv')
    match_pred.fillna('', inplace=True)
    match_pred = match_pred['hotel_cluster'].tolist()
    match_pred = [s.split(' ') for s in match_pred]
    
    pred_sub = pd.read_csv('blend_deal.csv')
    ids = pred_sub.id
    pred_sub = pred_sub['hotel_cluster'].tolist()
    pred_sub = [s.split(' ') for s in pred_sub]
    
    def f0(seq, idfun=None):
        '''order-preserving dedupe that also drops empty strings'''
        if idfun is None:
            def idfun(x): return x
        seen = {}
        result = []
        for item in seq:
            marker = idfun(item)
            if (marker in seen) or (marker == ''): continue
            seen[marker] = 1
            result.append(item)
        return result
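    # e.g. f0(['91','41'] + ['41','48','64','5','65'])[:5] -> ['91','41','48','64','5']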
        
    full_preds = [f0(match_pred[p] + pred_sub[p])[:5] for p in range(len(pred_sub))]
    
    write_p = [" ".join([str(l) for l in p]) for p in full_preds]
    write_frame = ["{0},{1}".format(ids[i], write_p[i]) for i in range(len(full_preds))]
    write_frame = ["id,hotel_cluster"] + write_frame
    with open("final_predictions.csv", "w+") as f:
        f.write("\n".join(write_frame))
    
    

    Results:


    [leaderboard screenshots]
