Expedia Data Mining (Kaggle Competition)


Author: SJTU_JORY | Published 2018-09-08 12:58

    1. Two methods were used: SGD logistic regression (SGDLR) and Random Forest.
    2. Both were combined with this competition's well-known data leakage.
    3. Final score: 49.999, good for 104th place on the Kaggle leaderboard (out of 1,700+ teams).
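
    The competition metric is MAP@5: each row may list up to five predicted clusters, and a hit at rank k earns 1/k, averaged over rows. A minimal sketch of the computation, with toy numbers (not competition data):

    # MAP@5 with one relevant item per row: score = 1/rank of the first hit, else 0
    def map5(predicted, actual):
        total = 0.0
        for top5, truth in zip(predicted, actual):
            for rank, p in enumerate(top5[:5], start=1):
                if p == truth:
                    total += 1.0 / rank
                    break
        return total / len(actual)

    # truth 42 is found at rank 2 and scores 0.5; the second row is a miss
    print(map5([[7, 42, 1, 3, 9], [5, 6, 8, 0, 2]], [42, 99]))  # (0.5 + 0)/2 = 0.25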
    1. Leakage solution

    The leak: train and test rows that share the same (user_location_city, orig_destination_distance) pair almost always refer to the same hotel, so clusters observed for that pair in train can be replayed on test. The script below builds several weighted lookup tables from train.csv and writes up to five clusters per test row.

    # -*- coding: utf-8 -*-
    from heapq import nlargest
    from operator import itemgetter
    
    def leakage_deal():
        f=open("train.csv", "r")
        f.readline()    
        best_hotels_odd_ulc={}
        best_hotels_miss_odd={}
        best_h00={}
        best_h01={}
        count=0
        while True:
            line=f.readline().strip()
            count+=1
            if line == '':
                break
            arr=line.split(",")
            book_year=int(arr[0][:4])
            book_month=int(arr[0][5:7])
            user_location_city=arr[5]
            orig_destination_distance=arr[6]
            user_id=arr[7]
            srch_destination_id=arr[16]
            hotel_country=arr[21]
            hotel_market=arr[22]
            is_booking=float(arr[18])
            hotel_cluster=arr[23]
    
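            # recency weight: squared months since Dec 2012, so recent events dominate;
            # a booking (3 + 17.60) outweighs a click (3) by roughly 7x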
            relative_ref_month=((book_year-2012)*12+(book_month-12))
            append_weight=relative_ref_month*relative_ref_month*(3+17.60*is_booking)
    
            if user_location_city!='' and orig_destination_distance!='' and user_id!='' and srch_destination_id!= '' and hotel_country!= '':
                s00=hash(str(user_id)+':'+str(user_location_city)+':'+str(srch_destination_id)+':'+str(hotel_country)+':'+str(hotel_market))
                if s00 in best_h00:
                    if hotel_cluster in best_h00[s00]:
                        best_h00[s00][hotel_cluster] += append_weight
                    else:
                        best_h00[s00][hotel_cluster] = append_weight
                else:
                    best_h00[s00] = {}
                    best_h00[s00][hotel_cluster] = append_weight
    
            if user_location_city!='' and orig_destination_distance!='' and user_id!='' and srch_destination_id!= '':
                s01=hash(str(user_id)+':'+str(srch_destination_id)+':'+str(hotel_country)+':'+str(hotel_market))
                if s01 in best_h01:
                    if hotel_cluster in best_h01[s01]:
                        best_h01[s01][hotel_cluster]+=append_weight
                    else:
                        best_h01[s01][hotel_cluster]=append_weight
                else:
                    best_h01[s01]={}
                    best_h01[s01][hotel_cluster]=append_weight
    
    
            if user_location_city!= '' and orig_destination_distance=='' and user_id!='' and srch_destination_id!='' and hotel_country!='':
                s0 = hash(str(user_id)+':'+str(user_location_city)+':'+str(srch_destination_id)+':'+str(hotel_country)+':'+str(hotel_market))
                if s0 in best_hotels_miss_odd:
                    if hotel_cluster in best_hotels_miss_odd[s0]:
                        best_hotels_miss_odd[s0][hotel_cluster]+=append_weight
                    else:
                        best_hotels_miss_odd[s0][hotel_cluster]=append_weight
                else:
                    best_hotels_miss_odd[s0]={}
                    best_hotels_miss_odd[s0][hotel_cluster]=append_weight
    
            if user_location_city!='' and orig_destination_distance!='':
                s1 = hash(str(user_location_city)+':'+str(orig_destination_distance))
    
                if s1 in best_hotels_odd_ulc:
                    if hotel_cluster in best_hotels_odd_ulc[s1]:
                        best_hotels_odd_ulc[s1][hotel_cluster]+=relative_ref_month
                    else:
                        best_hotels_odd_ulc[s1][hotel_cluster]=relative_ref_month
                else:
                    best_hotels_odd_ulc[s1]={}
                    best_hotels_odd_ulc[s1][hotel_cluster]=relative_ref_month
    
        f.close()
        return best_h00,best_h01, best_hotels_odd_ulc, best_hotels_miss_odd
    
    def submit(best_h00, best_h01, best_hotels_odd_ulc, best_hotels_miss_odd):
        path='leakage_deal.csv'
        out=open(path, "w")
        f=open("test.csv", "r")
        f.readline()
        count=0
        count0=0
        count00=0
        count1=0
        out.write("id,hotel_cluster\n")
        while True:
            line=f.readline().strip()
            count+=1
            if count % 100000 == 0:
                print('Write {} lines...'.format(count))
            if line == '':
                break
            arr=line.split(",")
            id=arr[0]
            user_location_city=arr[6]
            orig_destination_distance=arr[7]
            user_id=arr[8]
            srch_destination_id=arr[17]
            hotel_country=arr[20]
            hotel_market=arr[21]
            out.write(str(id) + ',')
            filled=[]
            s1=hash(str(user_location_city)+':'+str(orig_destination_distance))
            if s1 in best_hotels_odd_ulc:
                d=best_hotels_odd_ulc[s1]
            topitems=nlargest(5, sorted(d.items()), key=itemgetter(1))  # sorted() makes tie-breaks deterministic
                for i in range(len(topitems)):
                    if topitems[i][0] in filled:
                        continue
                    if len(filled) == 5:
                        break
                    out.write(' ' + topitems[i][0])
                    filled.append(topitems[i][0])
                    count1 += 1
            if orig_destination_distance == '':
                s0=hash(str(user_id)+':'+str(user_location_city)+':'+str(srch_destination_id)+':'+str(hotel_country)+':'+str(hotel_market))
                if s0 in best_hotels_miss_odd:
                    d=best_hotels_miss_odd[s0]
                    topitems=nlargest(4, sorted(d.items()), key=itemgetter(1))
                    for i in range(len(topitems)):
                        if topitems[i][0] in filled:
                            continue
                        if len(filled) == 5:
                            break
                        out.write(' ' + topitems[i][0])
                        filled.append(topitems[i][0])
                        count0+=1
            s00=hash(str(user_id)+':'+str(user_location_city)+':'+str(srch_destination_id)+':'+str(hotel_country)+':'+str(hotel_market))
            s01=hash(str(user_id)+':'+str(srch_destination_id)+':'+str(hotel_country)+':'+str(hotel_market))
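            # fall back to the coarser key (without user_location_city) only when
            # the exact s00 key was never seen in train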
            if s01 in best_h01 and s00 not in best_h00:
                d=best_h01[s01]
                topitems=nlargest(4, sorted(d.items()), key=itemgetter(1))
                for i in range(len(topitems)):
                    if topitems[i][0] in filled:
                        continue
                    if len(filled) == 5:
                        break
                    out.write(' ' + topitems[i][0])
                    filled.append(topitems[i][0])
                    count00 += 1
            out.write("\n")
        out.close()
        print('count 1=',count1)
        print('count 0=',count0)
        print('count 00=',count00)
    
    best_h00,best_h01, best_hotels_odd_ulc, best_hotels_miss_odd = leakage_deal()
    submit(best_h00, best_h01, best_hotels_odd_ulc, best_hotels_miss_odd)
    

    2. Random Forest

    A RandomForestClassifier is grown incrementally over 200,000-row chunks of train.csv (booking rows only), after merging in the destination latent features and a per-destination cluster-popularity table.
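
    A minimal sketch of the warm_start pattern used below (toy data, illustrative names only):

    from sklearn.ensemble import RandomForestClassifier
    import numpy as np

    rng = np.random.RandomState(0)
    clf = RandomForestClassifier(n_estimators=1, warm_start=True)
    for _ in range(3):                  # stand-ins for the real data chunks
        Xc, yc = rng.rand(50, 4), rng.randint(0, 2, 50)
        clf.set_params(n_estimators=clf.n_estimators + 1)
        clf.fit(Xc, yc)                 # keeps the old trees, trains only the new one
    print(len(clf.estimators_))         # 4 trees, each having seen a single chunk

    Each tree only ever sees one chunk, and the class set must stay stable across fits, which is why the script below only fits on chunks containing all 100 clusters.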

    # -*- coding: utf-8 -*-
    import os
    import pandas as pd
    import numpy as np
    import h5py
    from sklearn.ensemble import RandomForestClassifier
    
    def pre_deal(data):
        '''data_pre_deal'''
        try:
            # repair impossible dates such as '2161-10-00' that were left as strings;
            # if read_csv already parsed these columns, .str raises and we skip this
            data.loc[data.srch_ci.str.endswith('00'),'srch_ci'] = '2015-12-31'
            data['srch_ci'] = data.srch_ci.astype(np.datetime64)
            data.loc[data.date_time.str.endswith('00'),'date_time'] = '2015-12-31'
            data['date_time'] = data.date_time.astype(np.datetime64)
        except:
            pass
        data.fillna(0, inplace=True)
        #calculate the duration in hotel
        data['live_in_days'] = data.srch_co-data.srch_ci
        data['live_in_days'] = data['live_in_days'].apply(lambda ts: ts/np.timedelta64(1,'D'))
        #calculate the time from book to live in the hotel
        data['date_to_live_days'] = data.srch_ci-data.date_time
        data['date_to_live_days'] = data['date_to_live_days'].apply(lambda ts: ts/np.timedelta64(1,'D'))
        data['ci_month'] = data['srch_ci'].apply(lambda dt: dt.month)
        data['ci_day'] = data['srch_ci'].apply(lambda dt: dt.day)
        data['date_month'] = data['date_time'].apply(lambda dt: dt.month)
        data['date_day'] = data['date_time'].apply(lambda dt: dt.day)
        data['date_hour'] = data['date_time'].apply(lambda dt: dt.hour)
        data.drop(['date_time', 'user_id', 'srch_ci', 'srch_co'], axis=1, inplace=True)
    # build (or load) a per-destination cluster-popularity table: within each
    # (srch_destination_id, hotel_country, hotel_market) group, every cluster is
    # scored 0.85*bookings + 0.15*clicks, then normalized to sum to 1
    if os.path.exists('srch_dest_hc_hm_agg.csv'):
        agg1 = pd.read_csv('srch_dest_hc_hm_agg.csv')
    else:
        reader = pd.read_csv('train.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=200000)
        pieces = [chunk.groupby(['srch_destination_id','hotel_country','hotel_market','hotel_cluster'])['is_booking'].agg(['sum','count']) for chunk in reader]
        agg = pd.concat(pieces).groupby(level=[0,1,2,3]).sum()
        agg.dropna(inplace=True)
        agg['sum_and_cnt'] = 0.85*agg['sum'] + 0.15*agg['count']
        agg = agg.groupby(level=[0,1,2]).apply(lambda x: x.astype(float)/x.sum())
        agg.reset_index(inplace=True)
        agg1 = agg.pivot_table(index=['srch_destination_id','hotel_country','hotel_market'], columns='hotel_cluster', values='sum_and_cnt').reset_index()
        agg1.to_csv('srch_dest_hc_hm_agg.csv', index=False)
        #clean memory
        del pieces,agg
    
    destinations = pd.read_csv('destinations.csv')
    submission = pd.read_csv('sample_submission.csv')
    
    # warm_start grows the forest chunk by chunk: the first qualifying chunk trains
    # the initial ~100 trees; every later chunk adds a single tree
    clf=RandomForestClassifier(n_estimators=100, n_jobs=-1, warm_start=True)
    count=0
    chunksize=200000
    reader = pd.read_csv('train.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=chunksize)
    for chunk in reader:
        try:
            chunk = chunk[chunk.is_booking==1]
            chunk = pd.merge(chunk, destinations, how='left', on='srch_destination_id')
            chunk = pd.merge(chunk, agg1, how='left', on=['srch_destination_id','hotel_country','hotel_market'])
            pre_deal(chunk)
            y = chunk.hotel_cluster
            chunk.drop(['cnt', 'hotel_cluster', 'is_booking'], axis=1, inplace=True)
            if len(y.unique()) == 100:   # only fit when every cluster is present
                clf.set_params(n_estimators=clf.n_estimators+1)
                clf.fit(chunk, y)
            count = count + chunksize
            print(count,' have done')
            if(count/chunksize == 300):
                break
        except Exception as e:
            print(str(e))
            pass
    
    count = 0
    chunksize = 10000
    preds = np.empty((submission.shape[0],clf.n_classes_))
    reader = pd.read_csv('test.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=chunksize)
    for chunk in reader:
        chunk = pd.merge(chunk, destinations, how='left', on='srch_destination_id')
        chunk = pd.merge(chunk, agg1, how='left', on=['srch_destination_id','hotel_country','hotel_market'])
        chunk.drop(['id'], axis=1, inplace=True)
        pre_deal(chunk)
        pred = clf.predict_proba(chunk)
        preds[count:(count + chunk.shape[0]),:] = pred
        count = count + chunksize
        print(count,' have done')
    del clf,agg1
    
    if os.path.exists('rf.h5'):
        with h5py.File('rf.h5', 'r+') as hf:
            predslatesthf = hf['preds_latest']
            preds += predslatesthf[:]    # accumulate across runs; .value was removed in h5py >= 3.0
            predslatesthf[...] = preds
    else:
        with h5py.File('rf.h5', 'w') as hf:
            hf.create_dataset('preds_latest', data=preds)
    # column index equals cluster label here, since the classes are exactly 0..99
    fea_ind = np.argsort(-preds, axis=1)[:,:5]
    happend = [' '.join(row.astype(str)) for row in fea_ind]
    submit = pd.DataFrame(data=happend, index=submission.id)
    submit.reset_index(inplace=True)
    submit.columns = submission.columns
    submit.to_csv('rf_deal.csv', index=False)
    

    3. SGDLR

    SGDClassifier with logistic loss is trained via partial_fit over chunks, on hashed categorical features plus two interaction features; bookings receive 5x the sample weight of clicks.
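
    The categorical columns are one-hot encoded with the hashing trick: every 'column=value' token is hashed into one of n_features columns, so no global dictionary has to be kept in memory. The script builds the sparse matrix by hand; an equivalent sketch (not the author's code) using scikit-learn's FeatureHasher:

    from sklearn.feature_extraction import FeatureHasher

    # each row is a list of 'col=value' tokens; tokens hash to column indices
    hasher = FeatureHasher(n_features=2**22, input_type='string', alternate_sign=False)
    rows = [['user_id=93', 'hotel_market=628'],
            ['user_id=93', 'hotel_market=110']]
    X = hasher.transform(rows)   # scipy.sparse CSR matrix of shape (2, 2**22)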

    # -*- coding: utf-8 -*-
    import pandas as pd
    from scipy.sparse import csr_matrix, hstack
    import numpy as np
    import h5py
    import pickle
    from sklearn.linear_model import SGDClassifier
    import os
    cat_col = ['user_id','user_location_city','srch_destination_id','srch_destination_type_id',
               'hotel_continent','hotel_country', 'hotel_market']
    num_col = ['is_mobile', 'is_package']
    def bin_time(t):
        '''bucket days from booking to check-in into five coarse bins'''
        if t < 0:
            x = 0
        elif t < 2:
            x = 1
        elif t < 7:
            x = 2
        elif t < 30:
            x = 3
        else:
            x = 4
        return x
    
    def pre_process(data):
        try:
            # same date repair as in the Random Forest script; skipped when
            # read_csv has already parsed the columns
            data.loc[data.srch_ci.str.endswith('00'),'srch_ci'] = '2015-12-31'
            data['srch_ci'] = data.srch_ci.astype(np.datetime64)
            data.loc[data.date_time.str.endswith('00'),'date_time'] = '2015-12-31'
            data['date_time'] = data.date_time.astype(np.datetime64)
        except:
            pass
        data.fillna(0, inplace=True)
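        # interaction features: (check-in month x destination) and
        # (days-to-check-in bin x destination), reduced to single hashed ids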
        data['ci_month'] = data['srch_ci'].apply(lambda dt: dt.month)
        data['season_dest'] = 'season_dest' + data.ci_month.map(str) + '*' + data.srch_destination_id.map(str)
        data['season_dest'] = data['season_dest'].map(hash)
        data['date_to_live_days'] = data.srch_ci-data.date_time
        data['date_to_live_days'] = data['date_to_live_days'].apply(lambda td: td/np.timedelta64(1, 'D'))
        data['date_to_live_days'] = data['date_to_live_days'].map(bin_time)
        data['time_dest'] = 'time_dest' + data.date_to_live_days.map(str) + '*' + data.srch_destination_id.map(str)
        data['time_dest'] = data['time_dest'].map(hash)
        
        for col in cat_col:
            # hashing trick: Python randomizes str hashes per process, so the
            # pickled model is only reusable when PYTHONHASHSEED is fixed
            data[col] = col + data[col].map(str)
            data[col] = data[col].map(hash)
    
    submission = pd.read_csv('sample_submission.csv')
    cat_col_all = cat_col + ['season_dest', 'time_dest']
    def map5eval(preds, actual):
        '''MAP@5 of a probability matrix against the true clusters'''
        # the five highest-probability class columns, best first
        predicted = preds.argsort(axis=1)[:, -1:-6:-1]
        metric = 0.
        for i in range(5):
            metric += np.sum(actual==predicted[:,i])/(i+1)
        metric /= actual.shape[0]
        return metric
    
    if os.path.exists('sgd.pkl'):
        with open('sgd.pkl', 'rb') as f:
            clf = pickle.load(f)
    else:
        clf = SGDClassifier(loss='log', alpha=0.0000025, verbose=0)   # loss='log' is 'log_loss' in scikit-learn >= 1.1
    #clf.sparsify()
    chunksize = 200000
    n_features = 3000000   # width of the hashed feature space (reused at test time)
    for epoch in range(5):
        count = 0
        print('Epoch: ', epoch)
        reader = pd.read_csv('train.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=chunksize)
        for chunk in reader:
            try:
                pre_process(chunk)
                y = chunk.hotel_cluster
                sw = 1 + 4*chunk.is_booking
                chunk.drop(['cnt', 'hotel_cluster', 'is_booking'], axis=1, inplace=True)
                XN = csr_matrix(chunk[num_col].values)
                X = csr_matrix((chunk.shape[0], n_features))
                rows = np.arange(chunk.shape[0])
                for col in cat_col_all:
                    dat = np.ones(chunk.shape[0])
                    cols = chunk[col] % n_features
                    X += csr_matrix((dat, (rows, cols)), shape=(chunk.shape[0], n_features))
                X = hstack((XN, X))
                book_indices = sw[sw > 1].index.tolist()       # rows that are actual bookings
                x_indices=[(x-count) for x in book_indices]    # global index -> position within this chunk
                X_test = csr_matrix(X)[x_indices]
                y_test = y[book_indices]
                clf.partial_fit(X, y, classes=np.arange(100), sample_weight=sw)         
                count = count + chunksize
                map5 = map5eval(clf.predict_proba(X_test), y_test)   # in-sample check on this chunk's bookings
                print((count, map5),' have done')
                if(count/chunksize == 200):
                    break
            except Exception as e:
                count = count + chunksize
                print(str(e))
                pass
    
    with open('sgd.pkl', 'wb') as f:
        pickle.dump(clf, f)
    
    count = 0
    chunksize = 10000
    preds = np.empty((0,100))
    reader = pd.read_csv('test.csv', parse_dates=['date_time', 'srch_ci', 'srch_co'], chunksize=chunksize)
    for chunk in reader:
        chunk.drop(['id'], axis=1, inplace=True)
        pre_process(chunk)
        XN = csr_matrix(chunk[num_col].values)
        X = csr_matrix((chunk.shape[0], n_features))
        rows = np.arange(chunk.shape[0])
        for col in cat_col_all:
            dat = np.ones(chunk.shape[0])
            cols = chunk[col] % n_features
            X += csr_matrix((dat, (rows, cols)), shape=(chunk.shape[0], n_features))
        X = hstack((XN, X))
        pred = clf.predict_proba(X)
        preds = np.vstack((preds, pred))
        count = count + chunksize
        print(count,' have done')
    del clf
    
    if os.path.exists('sgd.h5'):
        with h5py.File('sgd.h5', 'r+') as hf:
            predshf = hf['preds']
            predshf[...] = preds
    else:
        with h5py.File('sgd.h5', 'w') as hf:
            hf.create_dataset('preds', data=preds)
    
    col_ind = np.argsort(-preds, axis=1)[:,:5]
    hc = [' '.join(row.astype(str)) for row in col_ind]
    submit = pd.DataFrame(data=hc, index=submission.id)
    submit.reset_index(inplace=True)
    submit.columns = submission.columns
    submit.to_csv('sgdlr_deal.csv', index=False)
    
    

    4. Blend

    The RF and SGDLR probability matrices are L1-normalized and combined with weights 0.54 and 0.46; the top five clusters of the blend form the submission.

    # -*- coding: utf-8 -*-
    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import normalize
    import h5py
    
    submission = pd.read_csv('sample_submission.csv')
    
    # read in RF results
    with h5py.File('rf.h5', 'r') as hf:
        predshf = hf['preds_latest']
        preds = 0.54*normalize(predshf[:], norm='l1', axis=1)   # .value was removed in h5py >= 3.0

    # read in SGD results ('sgd.h5' is what the SGDLR script above writes)
    with h5py.File('sgd.h5', 'r') as hf:
        predshf = hf['preds']
        preds += 0.46*normalize(predshf[:], norm='l1', axis=1)
    
    
    col_ind = np.argsort(-preds, axis=1)[:,:5]
    hc = [' '.join(row.astype(str)) for row in col_ind]
    
    sub = pd.DataFrame(data=hc, index=submission.id)
    sub.reset_index(inplace=True)
    sub.columns = submission.columns
    sub.to_csv('blend_deal.csv', index=False)
    
    

    5. Stack

    Finally, the high-precision leakage predictions go first and the blended model predictions fill the remaining slots, deduplicated, up to five clusters per row.

    # -*- coding: utf-8 -*-
    import pandas as pd
    
    match_pred = pd.read_csv('leakage_deal.csv')
    match_pred.fillna('', inplace=True)
    match_pred = match_pred['hotel_cluster'].tolist()
    match_pred = [s.split(' ') for s in match_pred]
    
    pred_sub = pd.read_csv('blend_deal.csv')
    ids = pred_sub.id
    pred_sub = pred_sub['hotel_cluster'].tolist()
    pred_sub = [s.split(' ') for s in pred_sub]
    
    def f0(seq, idfun=None):
        '''order-preserving dedupe that also drops empty strings'''
        if idfun is None:
            def idfun(x): return x
        seen = {}
        result = []
        for item in seq:
            marker = idfun(item)
            if (marker in seen) or (marker == ''): continue
            seen[marker] = 1
            result.append(item)
        return result
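    # e.g. f0(['91','41'] + ['41','48','64','5','65'])[:5] -> ['91','41','48','64','5']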
        
    full_preds = [f0(match_pred[p] + pred_sub[p])[:5] for p in range(len(pred_sub))]
    
    write_p = [" ".join([str(l) for l in p]) for p in full_preds]
    write_frame = ["{0},{1}".format(ids[i], write_p[i]) for i in range(len(full_preds))]
    write_frame = ["id,hotel_cluster"] + write_frame
    with open("final_predictions.csv", "w+") as f:
        f.write("\n".join(write_frame))
    
    

    Results:


    [leaderboard screenshots]
