美文网首页我爱编程
用python 实现 hive sql join和row_num

用python 实现 hive sql join和row_num

作者: 思索人生_mind | 来源:发表于2018-04-13 10:28 被阅读0次

    给定一个查询时间,找最近登录的用户

    create temporary function row_number as "com.ai.hive.udf.util.RowNumberUDF";

    select logint_time,username from 

    ( select ip,login_time,username from (

    select ip ,select_time from a join select ip,login_time,username from b on(a.ip=b.ip and a.login_time

    )t sort by login_time desc )p where row_number=1

    import pandas as pd

    import numpy as np

    login_column_names = ['ip','dip','type','uri','time']

    select_column_names = ['ip','dip','action','time']

    df_login = pd.read_csv('login.txt',sep='\t',encoding='utf-8',header=None,names=login_column_names)

    df_select = pd.read_csv('select.txt',sep='\t',encoding='utf-8',header=None,names=select_column_names)

    df_login[['username','password','authPassword','submit']]= df_login['uri'].str.replace('j_username=','').str.replace('password=','').str.split('&',expand=True)

    df_login

    x=[1,2,3,6,7,8]

    df_login.drop(df_login.columns[x], axis=1, inplace=True)

    df_login

    import time

    # time.strptime(df_login['time'],"%Y-%m-%d %H:%M:%S")

    df_login['time']

    df_login['new_time']=0

    a = 0

    for x in df_login['time']:

        print(int(time.mktime(time.strptime(x,"%Y-%m-%d %H:%M:%S.%f"))))

        df_login['new_time'][a] =  int(time.mktime(time.strptime(x,"%Y-%m-%d %H:%M:%S.%f")))

        a=a+1

    df_login['new_time']

    import time

    # time.strptime(df_login['time'],"%Y-%m-%d %H:%M:%S")

    df_select['time']

    df_select['new_time']=0

    a = 0

    for x in df_select['time']:

        print(int(time.mktime(time.strptime(x,"%Y-%m-%d %H:%M:%S.%f"))))

        df_select['new_time'][a] =  int(time.mktime(time.strptime(x,"%Y-%m-%d %H:%M:%S.%f")))

        a=a+1

    df_select

    df_on = df_select.merge(df_login,how='left',on=['ip'])

    df_on

    df_on['diff_time'] = df_on['new_time_x']-df_on['new_time_y']

    df_on

    df_on[df_on['diff_time']>=0]

    #找时间最小的那个

    df_on = df_on[df_on['diff_time']>=0]

    df_on

    df_on[['ip','time_x','username','diff_time']]

    def min_time(df,n=3,column='diff_time'):

        return df.sort_index(by=column,ascending=False)[-n:]

    df_on[['ip','time_x','username','diff_time']].groupby(['ip','time_x']).apply(min_time,n=1)

    相关文章

      网友评论

        本文标题:用python 实现 hive sql join和row_num

        本文链接:https://www.haomeiwen.com/subject/mfxckftx.html