美文网首页Python语言学习
Python数据可视化(一):散点图绘制

Python数据可视化(一):散点图绘制

作者: Davey1220 | 来源:发表于2021-03-03 21:33 被阅读0次

    输入数据格式

    image.png

    使用matplotlib包绘制散点图

    # 导入所需的python包
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    
    # 设置绘图格式
    plt.style.use('seaborn')
    %matplotlib inline
    
    # Build the example dataset: x runs from 1 to 100, y is x plus Gaussian noise
    df=pd.DataFrame({'x': range(1,101), 'y': np.random.randn(100)*15+range(1,101) })
    
    # Show the first and last 5 rows.
    # DataFrame.append was removed in pandas 2.0; pd.concat yields the same frame.
    pd.concat([df.head(5), df.tail(5)])
    
    x y
    0 1 7.821203
    1 2 8.372683
    2 3 10.616092
    3 4 -0.183374
    4 5 18.387730
    95 96 101.110453
    96 97 102.630476
    97 98 90.080476
    98 99 121.161754
    99 100 78.376947
    # 绘制基础散点图
    plt.plot( 'x', 'y', data=df, linestyle='none', marker='o')
    plt.show()
    
    image.png

    设置点的形状

    # marker参数设置点的形状
    
    # === first figure:
    plt.plot( 'x', 'y', data=df, linestyle='none', marker='*')
    plt.show()
    
    # === second figure:
    # Marker symbols to display, one per cell of a 4x4 grid
    all_poss=['.','o','v','^','>','<','s','p','*','h','H','D','d','1','','']
    
    # to see all possibilities:
    # markers.MarkerStyle.markers.keys()
    
    # Fix the visible range of the x and y axes
    plt.xlim(0.5,4.5)
    plt.ylim(0.5,4.5)
    
    # Hide the ticks and tick labels of both axes
    plt.xticks([])
    plt.yticks([])
    
    # Place the markers one by one: x is the slow index, y the fast one,
    # both running from 1 to 4.
    for idx, symbol in enumerate(all_poss):
      col, row = divmod(idx, 4)
      x, y = col + 1, row + 1
      plt.plot(x, y, marker=symbol, markerfacecolor='orange', markersize=23, markeredgecolor="black")
      # Label each marker with its symbol, slightly to the right
      plt.text(x+0.2, y, symbol, horizontalalignment='left', size='medium', color='black', weight='semibold')
    
    image.png image.png

    设置点的大小

    # markersize参数设置点的大小
    plt.plot( 'x', 'y', data=df, linestyle='none', marker='D', markersize=16)
    plt.show()
    
    image.png

    设置点的颜色

    # markerfacecolor参数设置点的颜色,markeredgecolor参数设置点边框的颜色, markeredgewidth参数设置点边框的宽度
    plt.plot( 'x', 'y', data=df, linestyle='none', marker="o", markersize=16, markerfacecolor='skyblue', markeredgecolor="black")
    plt.show()
    
    plt.plot( 'x', 'y', data=df, linestyle='none', marker='D', markersize=16, markeredgecolor="orange", markeredgewidth=5)
    plt.show()
    
    image.png image.png
    # 添加连接线,linestyle参数设置线的类型
    
    plt.plot( 'x', 'y', data=df, linestyle='-', marker='o')
    plt.show()
    
    image.png

    添加注释信息

    # Basic chart
    df=pd.DataFrame({'x': range(1,101), 'y': np.random.randn(100)*15+range(1,101) })
    plt.plot( 'x', 'y', data=df, linestyle='none', marker='o')
    
    # 添加文本注释和箭头
    # Annotate with text + Arrow
    plt.annotate(
    # Label and coordinate
    'This point is interesting!', xy=(25, 50), xytext=(0, 80),
    
    # Custom arrow
    arrowprops=dict(facecolor='black', shrink=0.05)
    )
    
    image.png
    # Scatter plot of a fresh random dataset
    df=pd.DataFrame({'x': range(1,101), 'y': np.random.randn(100)*15+range(1,101) })
    plt.plot( 'x', 'y', data=df, linestyle='none', marker='o')
    
    # Annotate with a mathtext formula at data coordinates (40, 0).
    # The raw string keeps the backslashes intact; the formula must be closed
    # with a matching '$' for matplotlib to render it as math.
    plt.text(40, 0, r'equation: $\sum_{i=0}^\infty x_i$', fontsize=20)
    
    image.png
    # Plot
    df=pd.DataFrame({'x': range(1,101), 'y': np.random.randn(100)*15+range(1,101) })
    plt.plot( 'x', 'y', data=df, linestyle='none', marker='o')
    
    # Annotation
    # 添加垂直线
    plt.axvline(40, color='r')
    # 添加水平线
    plt.axhline(40, color='green')
    
    image.png
    # libraries
    import matplotlib.patches as patches
    
    # Plot
    fig1 = plt.figure()
    ax1 = fig1.add_subplot(111)
    ax1.plot( 'x', 'y', data=df, linestyle='none', marker='o')
    
    # Add rectangle
    # 添加矩形区
    ax1.add_patch(
        patches.Rectangle(
            (20, 25), # (x,y)
            50, # width
            50, # height
            # You can add rotation as well with 'angle'
            alpha=0.3, facecolor="red", edgecolor="black", linewidth=3, linestyle='solid'
        )
    )
    
    image.png
    # Plot
    fig1 = plt.figure()
    ax1 = fig1.add_subplot(111)
    ax1.plot( 'x', 'y', data=df, linestyle='none', marker='o')
    
    # Annotation
    # 添加圆圈区
    ax1.add_patch(
        patches.Circle(
            (40, 35),           # (x,y)
            30,                    # radius
            alpha=0.3, facecolor="green", edgecolor="black", linewidth=1, linestyle='solid'
        )
    )
    
    image.png

    避免点的重叠

    # Dataset:
    # Build a sample dataset of three Gaussian clusters (groups A, B and C),
    # 20000 points each, so the resulting scatter plot is heavily overplotted.
    df=pd.DataFrame({'x': np.random.normal(10, 1.2, 20000), 'y': np.random.normal(10, 1.2, 20000), 'group': np.repeat('A',20000) })
    tmp1=pd.DataFrame({'x': np.random.normal(14.5, 1.2, 20000), 'y': np.random.normal(14.5, 1.2, 20000), 'group': np.repeat('B',20000) })
    tmp2=pd.DataFrame({'x': np.random.normal(9.5, 1.5, 20000), 'y': np.random.normal(15.5, 1.5, 20000), 'group': np.repeat('C',20000) })
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
    # same way and, like append, keeps each frame's own row labels.
    df=pd.concat([df, tmp1, tmp2])
    df.head(10)
    
    x y group
    0 11.529794 11.000711 A
    1 10.524043 11.541500 A
    2 9.845806 9.156706 A
    3 10.970836 9.428074 A
    4 10.748096 12.098970 A
    5 9.455139 8.636227 A
    6 8.094581 8.518158 A
    7 10.259945 9.168257 A
    8 9.420490 10.227326 A
    9 7.124481 9.170850 A
    # plot
    plt.plot( 'x', 'y', data=df, linestyle='', marker='o')
    
    # 设置x轴标签
    plt.xlabel('Value of X')
    
    # 设置y轴标签
    plt.ylabel('Value of Y')
    
    # 设置标题
    plt.title('Overplotting looks like that:', loc='left')
    
    image.png
    # 更改点的大小
    # Plot with small marker size
    plt.plot( 'x', 'y', data=df, linestyle='', marker='o', markersize=0.7)
    plt.xlabel('Value of X')
    plt.ylabel('Value of Y')
    plt.title('Overplotting? Try to reduce the dot size', loc='left')
    
    image.png
    # 设置点的透明度
    # Plot with transparency
    plt.plot( 'x', 'y', data=df, linestyle='', marker='o', markersize=3, alpha=0.05, color="red")
    
    # Titles
    plt.xlabel('Value of X')
    plt.ylabel('Value of Y')
    plt.title('Overplotting? Try to use transparency', loc='left')
    
    image.png
    # 随机取样
    # Sample 1000 random lines
    # 随机取1000行数据
    df_sample=df.sample(1000)
    
    # Make the plot with this subset
    plt.plot( 'x', 'y', data=df_sample, linestyle='', marker='o')
    
    # titles
    plt.xlabel('Value of X')
    plt.ylabel('Value of Y')
    plt.title('Overplotting? Sample your data', loc='left')
    
    image.png

    使用seaborn包绘制散点图

    # library & dataset
    import seaborn as sns
    
    # Load seaborn's built-in iris example dataset
    df = sns.load_dataset('iris')
    
    # Show the first and last 5 rows.
    # DataFrame.append was removed in pandas 2.0; pd.concat yields the same frame.
    pd.concat([df.head(5), df.tail(5)])
    
    sepal_length sepal_width petal_length petal_width species
    0 5.1 3.5 1.4 0.2 setosa
    1 4.9 3.0 1.4 0.2 setosa
    2 4.7 3.2 1.3 0.2 setosa
    3 4.6 3.1 1.5 0.2 setosa
    4 5.0 3.6 1.4 0.2 setosa
    145 6.7 3.0 5.2 2.3 virginica
    146 6.3 2.5 5.0 1.9 virginica
    147 6.5 3.0 5.2 2.0 virginica
    148 6.2 3.4 5.4 2.3 virginica
    149 5.9 3.0 5.1 1.8 virginica

    使用regplot函数绘制散点图

    # 查看regplot的用法
    help(sns.regplot)
    
    regplot(x, y, data=None, x_estimator=None, x_bins=None, x_ci='ci', scatter=True, fit_reg=True, ci=95, n_boot=1000, units=None, order=1, logistic=False, lowess=False, robust=False, logx=False, x_partial=None, y_partial=None, truncate=False, dropna=True, x_jitter=None, y_jitter=None, label=None, color=None, marker='o', scatter_kws=None, line_kws=None, ax=None)
        Plot data and a linear regression model fit.
    
        There are a number of mutually exclusive options for estimating the
        regression model. See the :ref:`tutorial <regression_tutorial>` for more
        information.    
    
        Parameters
        ----------
        x, y: string, series, or vector array
            Input variables. If strings, these should correspond with column names
            in ``data``. When pandas objects are used, axes will be labeled with
            the series name.
        data : DataFrame
            Tidy ("long-form") dataframe where each column is a variable and each
            row is an observation.    
        x_estimator : callable that maps vector -> scalar, optional
            Apply this function to each unique value of ``x`` and plot the
            resulting estimate. This is useful when ``x`` is a discrete variable.
            If ``x_ci`` is given, this estimate will be bootstrapped and a
            confidence interval will be drawn.    
        x_bins : int or vector, optional
            Bin the ``x`` variable into discrete bins and then estimate the central
            tendency and a confidence interval. This binning only influences how
            the scatterplot is drawn; the regression is still fit to the original
            data.  This parameter is interpreted either as the number of
            evenly-sized (not necessary spaced) bins or the positions of the bin
            centers. When this parameter is used, it implies that the default of
            ``x_estimator`` is ``numpy.mean``.    
        x_ci : "ci", "sd", int in [0, 100] or None, optional
            Size of the confidence interval used when plotting a central tendency
            for discrete values of ``x``. If ``"ci"``, defer to the value of the
            ``ci`` parameter. If ``"sd"``, skip bootstrapping and show the
            standard deviation of the observations in each bin.    
        scatter : bool, optional
            If ``True``, draw a scatterplot with the underlying observations (or
            the ``x_estimator`` values).    
        fit_reg : bool, optional
            If ``True``, estimate and plot a regression model relating the ``x``
            and ``y`` variables.    
        ci : int in [0, 100] or None, optional
            Size of the confidence interval for the regression estimate. This will
            be drawn using translucent bands around the regression line. The
            confidence interval is estimated using a bootstrap; for large
            datasets, it may be advisable to avoid that computation by setting
            this parameter to None.    
        n_boot : int, optional
            Number of bootstrap resamples used to estimate the ``ci``. The default
            value attempts to balance time and stability; you may want to increase
            this value for "final" versions of plots.    
        units : variable name in ``data``, optional
            If the ``x`` and ``y`` observations are nested within sampling units,
            those can be specified here. This will be taken into account when
            computing the confidence intervals by performing a multilevel bootstrap
            that resamples both units and observations (within unit). This does not
            otherwise influence how the regression is estimated or drawn.    
        order : int, optional
            If ``order`` is greater than 1, use ``numpy.polyfit`` to estimate a
            polynomial regression.    
        logistic : bool, optional
            If ``True``, assume that ``y`` is a binary variable and use
            ``statsmodels`` to estimate a logistic regression model. Note that this
            is substantially more computationally intensive than linear regression,
            so you may wish to decrease the number of bootstrap resamples
            (``n_boot``) or set ``ci`` to None.    
        lowess : bool, optional
            If ``True``, use ``statsmodels`` to estimate a nonparametric lowess
            model (locally weighted linear regression). Note that confidence
            intervals cannot currently be drawn for this kind of model.    
        robust : bool, optional
            If ``True``, use ``statsmodels`` to estimate a robust regression. This
            will de-weight outliers. Note that this is substantially more
            computationally intensive than standard linear regression, so you may
            wish to decrease the number of bootstrap resamples (``n_boot``) or set
            ``ci`` to None.    
        logx : bool, optional
            If ``True``, estimate a linear regression of the form y ~ log(x), but
            plot the scatterplot and regression model in the input space. Note that
            ``x`` must be positive for this to work.    
        {x,y}_partial : strings in ``data`` or matrices
            Confounding variables to regress out of the ``x`` or ``y`` variables
            before plotting.    
        truncate : bool, optional
            By default, the regression line is drawn to fill the x axis limits
            after the scatterplot is drawn. If ``truncate`` is ``True``, it will
            instead by bounded by the data limits.    
        {x,y}_jitter : floats, optional
            Add uniform random noise of this size to either the ``x`` or ``y``
            variables. The noise is added to a copy of the data after fitting the
            regression, and only influences the look of the scatterplot. This can
            be helpful when plotting variables that take discrete values.    
        label : string
            Label to apply to ether the scatterplot or regression line (if
            ``scatter`` is ``False``) for use in a legend.
        color : matplotlib color
            Color to apply to all plot elements; will be superseded by colors
            passed in ``scatter_kws`` or ``line_kws``.
        marker : matplotlib marker code
            Marker to use for the scatterplot glyphs.
        {scatter,line}_kws : dictionaries
            Additional keyword arguments to pass to ``plt.scatter`` and
            ``plt.plot``.    
        ax : matplotlib Axes, optional
            Axes object to draw the plot onto, otherwise uses the current Axes.
    
        Returns
        -------
        ax : matplotlib Axes
            The Axes object containing the plot.
    
        See Also
        --------
        lmplot : Combine :func:`regplot` and :class:`FacetGrid` to plot multiple
                 linear relationships in a dataset.
        jointplot : Combine :func:`regplot` and :class:`JointGrid` (when used with
                    ``kind="reg"``).
        pairplot : Combine :func:`regplot` and :class:`PairGrid` (when used with
                   ``kind="reg"``).
        residplot : Plot the residuals of a linear regression model.
    
        Notes
        -----
    
        The :func:`regplot` and :func:`lmplot` functions are closely related, but
        the former is an axes-level function while the latter is a figure-level
        function that combines :func:`regplot` and :class:`FacetGrid`.    
    
        It's also easy to combine combine :func:`regplot` and :class:`JointGrid` or
        :class:`PairGrid` through the :func:`jointplot` and :func:`pairplot`
        functions, although these do not directly accept all of :func:`regplot`'s
        parameters.
    
        Examples
        --------
    
        Plot the relationship between two variables in a DataFrame:
    
        .. plot::
            :context: close-figs
    
            >>> import seaborn as sns; sns.set(color_codes=True)
            >>> tips = sns.load_dataset("tips")
            >>> ax = sns.regplot(x="total_bill", y="tip", data=tips)
    
        Plot with two variables defined as numpy arrays; use a different color:
    
        .. plot::
            :context: close-figs
    
            >>> import numpy as np; np.random.seed(8)
            >>> mean, cov = [4, 6], [(1.5, .7), (.7, 1)]
            >>> x, y = np.random.multivariate_normal(mean, cov, 80).T
            >>> ax = sns.regplot(x=x, y=y, color="g")
    
        Plot with two variables defined as pandas Series; use a different marker:
    
        .. plot::
            :context: close-figs
    
            >>> import pandas as pd
            >>> x, y = pd.Series(x, name="x_var"), pd.Series(y, name="y_var")
            >>> ax = sns.regplot(x=x, y=y, marker="+")
    
        Use a 68% confidence interval, which corresponds with the standard error
        of the estimate:
    
        .. plot::
            :context: close-figs
    
            >>> ax = sns.regplot(x=x, y=y, ci=68)
    
        Plot with a discrete ``x`` variable and add some jitter:
    
        .. plot::
            :context: close-figs
    
            >>> ax = sns.regplot(x="size", y="total_bill", data=tips, x_jitter=.1)
    
        Plot with a discrete ``x`` variable showing means and confidence intervals
        for unique values:
    
        .. plot::
            :context: close-figs
    
            >>> ax = sns.regplot(x="size", y="total_bill", data=tips,
            ...                  x_estimator=np.mean)
    
        Plot with a continuous variable divided into discrete bins:
    
        .. plot::
            :context: close-figs
    
            >>> ax = sns.regplot(x=x, y=y, x_bins=4)
    
        Fit a higher-order polynomial regression and truncate the model prediction:
    
        .. plot::
            :context: close-figs
    
            >>> ans = sns.load_dataset("anscombe")
            >>> ax = sns.regplot(x="x", y="y", data=ans.loc[ans.dataset == "II"],
            ...                  scatter_kws={"s": 80},
            ...                  order=2, ci=None, truncate=True)
    
        Fit a robust regression and don't plot a confidence interval:
    
        .. plot::
            :context: close-figs
    
            >>> ax = sns.regplot(x="x", y="y", data=ans.loc[ans.dataset == "III"],
            ...                  scatter_kws={"s": 80},
            ...                  robust=True, ci=None)
    
        Fit a logistic regression; jitter the y variable and use fewer bootstrap
        iterations:
    
        .. plot::
            :context: close-figs
    
            >>> tips["big_tip"] = (tips.tip / tips.total_bill) > .175
            >>> ax = sns.regplot(x="total_bill", y="big_tip", data=tips,
            ...                  logistic=True, n_boot=500, y_jitter=.03)
    
        Fit the regression model using log(x) and truncate the model prediction:
    
        .. plot::
            :context: close-figs
    
            >>> ax = sns.regplot(x="size", y="total_bill", data=tips,
            ...                  x_estimator=np.mean, logx=True, truncate=True)
    
    
    # 使用regplot函数绘制散点图
    sns.regplot(x=df["sepal_length"], y=df["sepal_width"])
    #sns.plt.show()
    
    image.png
    # Without regression fit:
    # 去掉回归线
    sns.regplot(x=df["sepal_length"], y=df["sepal_width"], fit_reg=False)
    #sns.plt.show()
    
    image.png
    # Change shape of marker
    # marker参数设置点的形状
    sns.regplot(x=df["sepal_length"], y=df["sepal_width"], marker="+", fit_reg=False)
    #sns.plt.show()
    
    image.png
    # More marker customization:
    # 使用scatter_kws参数设置点的颜色、透明度和大小
    sns.regplot(x=df["sepal_length"], y=df["sepal_width"], fit_reg=False, scatter_kws={"color":"darkred","alpha":0.3,"s":200} )
    #sns.plt.show()
    
    image.png

    使用lmplot函数绘制散点图

    # Use the 'hue' argument to provide a factor variable
    # hue参数设置分类变量颜色
    sns.lmplot( x="sepal_length", y="sepal_width", data=df, fit_reg=False, hue='species', legend=False)
    
    # Move the legend to an empty part of the plot
    plt.legend(loc='lower right')
    
    #sns.plt.show()
    
    image.png
    # give a list to the marker argument
    # markers参数设置点的形状
    sns.lmplot( x="sepal_length", y="sepal_width", data=df, fit_reg=False, hue='species', legend=False, markers=["o", "x", "1"])
    
    # Move the legend to an empty part of the plot
    plt.legend(loc='lower right')
    
    #sns.plt.show()
    
    image.png
    # Use the 'palette' argument
    # palette参数设置颜色画板
    sns.lmplot( x="sepal_length", y="sepal_width", data=df, fit_reg=False, hue='species', legend=True, palette="Set2")
    
    # Move the legend to an empty part of the plot
    #plt.legend(loc='lower right')
    
    #sns.plt.show()
    
    image.png

    使用jointplot函数绘制边际图

    # Custom the inside plot: options are: “scatter” | “reg” | “resid” | “kde” | “hex”
    # kind参数设置绘图类型
    sns.jointplot(x=df["sepal_length"], y=df["sepal_width"], kind='scatter')
    sns.jointplot(x=df["sepal_length"], y=df["sepal_width"], kind='hex')
    sns.jointplot(x=df["sepal_length"], y=df["sepal_width"], kind='kde')
    
    image.png image.png image.png
    # Then you can pass arguments to each type:
    # 设置点和线颜色,大小
    sns.jointplot(x=df["sepal_length"], y=df["sepal_width"], kind='scatter', s=200, color='m', edgecolor="skyblue", linewidth=2)
    
    # Custom the color
    sns.set(style="white", color_codes=True)
    sns.jointplot(x=df["sepal_length"], y=df["sepal_width"], kind='kde', color="skyblue")
    
    image.png image.png

    使用pairplot函数绘制配对散点图

    # first
    sns.pairplot(df, kind="scatter", hue="species", markers=["o", "s", "D"], palette="Set2")
    plt.show()
    
    # second: you can give other arguments with plot_kws.
    sns.pairplot(df, kind="scatter", hue="species", plot_kws=dict(s=80, edgecolor="white", linewidth=2.5))
    plt.show()
    
    image.png image.png

    参考来源:https://python-graph-gallery.com/scatter-plot/

    相关文章

      网友评论

        本文标题:Python数据可视化(一):散点图绘制

        本文链接:https://www.haomeiwen.com/subject/fkpcqltx.html