Pandas

作者: manbug | 来源:发表于2017-09-18 15:18 被阅读0次
    import pandas as pd
    

    Excel Sheet

    1. read one
    pd.read_excel("xxx.xlsx", sheetname="first")
    
    1. read all
    di = pd.read_excel("xxx.xlsx", sheetname=None)
    return a dict: {"sheetname": xx, ...}
    
    1. write Sheets
    writer = pd.ExcelWriter("xxx.xlsx", engine="xlsxwriter")
    for df in dfs:
        df.to_excel(writer, sheet_name="xx")
    
    1. write time
    df.to_excel("xxx.xlsx", engin="openpyxl")
    
    1. read sql
    from sqlalchemy import create_engine
    db_url = "postgresql+psycopg2://db_name:password@ip/db_name"
    engine = create_engine(db_url)
    SQL_QUERY = """ xxx """
    df = pd.read_sql(SQL_QUERY, engine)
    
    1. del Column / Rows
    del df["column"]
    df = df[:500]
    
    1. Judge nan
    import math
    xx = df.get_value(index, column)
    math.isnan(xx)
    
    1. Write in memory (Supply download)
    import io
    buffer = io.BytesIO()
    writer = pd.ExcelWriter(buffer, engine='xlsxwriter')
    df = ...
    df.to_excel(wirter)
    writer.save()
    data = buffer.getvalue()
    response = HttpResponse(data)
    response['Content-Type'] = 'application/octet-stream'
    response['Content-Disposition'] = 'attachment;filename="{0}"'.format("target.xlsx")
    

    Read Json

    from pandas.io.json import json_normalize
    di = {"a": 1, "b": 2}
    df = json_normalize(di)
    

    Create New Column

    df = ...
    df["new_line"] = df["line1"] + df["line2"]
    

    Groupby

    合并重复项
    df.groupby(["小区名称", "行政区", "面积", "所在层" ...])["时间"].min().reset_index()    合并重复项,时间取最小值
    # 注: 如果某一行作为groupby的列为空(nan),那么这一列必定不会出现在groupby的结果里
    可以先填充空值: df.fillna("-")
    

    Calc Rate

    计算每一部分占总体的比例
    total = len(df)
    pd.value_counts(pd.cut(df["column"], bins=[字段分割])) / total
    

    合并

    df = pd.concat([df1, df2])
    

    读取大的csv文件

    reader = pd.read_csv("/home/manbug/小区.csv", iterator=True)
    # r = reader.get_chunk(5)
    while reader:
        r = reader.get_chunk(5)
        TODO...
    

    filter技巧

    索引
    df.ix["xxx"]
    字符串
    df[df["抓取时间"].str.startswith('2017')]
    时间
    df[df["时间"].dt.month>5]
    

    转list

    Series:
    df["MAC地址"].tolist()
    # df["MAC地址"].values.tolist()
    

    相关文章

      网友评论

        本文标题:Pandas

        本文链接:https://www.haomeiwen.com/subject/jtrbsxtx.html