1 Subsetting by Columns
# Select a subset of columns by passing Column objects built with col().
from pyspark.sql.functions import col
df_select = df.select(col('_c0'), col('_c1'), col('_c3'), col('_c9'))
# Print the first 5 rows of the result.
df_select.show(5)
+------------+----------+-----+-----+
|         _c0|       _c1|  _c3|  _c9|
+------------+----------+-----+-----+
|100002091588|01/01/2015|4.125|16740|
|100002091588|02/01/2015|4.125|16740|
|100002091588|03/01/2015|4.125|16740|
|100002091588|04/01/2015|4.125|16740|
|100002091588|05/01/2015|4.125|16740|
+------------+----------+-----+-----+
# Same column subset, this time by indexing the DataFrame with a list of names.
df_select = df[['_c0', '_c1', '_c3', '_c9']]
df_select.show(5)
+------------+----------+-----+-----+
|         _c0|       _c1|  _c3|  _c9|
+------------+----------+-----+-----+
|100002091588|01/01/2015|4.125|16740|
|100002091588|02/01/2015|4.125|16740|
|100002091588|03/01/2015|4.125|16740|
|100002091588|04/01/2015|4.125|16740|
|100002091588|05/01/2015|4.125|16740|
+------------+----------+-----+-----+
# Remove a single column (_c3) from the subset; the other columns are kept.
df_drop = df_select.drop(col('_c3'))
df_drop.show(5)
+------------+----------+-----+
|         _c0|       _c1|  _c9|
+------------+----------+-----+
|100002091588|01/01/2015|16740|
|100002091588|02/01/2015|16740|
|100002091588|03/01/2015|16740|
|100002091588|04/01/2015|16740|
|100002091588|05/01/2015|16740|
+------------+----------+-----+
2 Subsetting by Rows
# Summary statistics for _c6 before any row filtering.
df.describe('_c6').show()
+-------+-----------------+
|summary|              _c6|
+-------+-----------------+
|  count|          3526154|
|   mean|354.7084951479714|
| stddev| 4.01181251079202|
|    min|              292|
|    max|              480|
+-------+-----------------+
# Keep only the rows whose _c6 value is below 358, then re-check the summary.
df_sub = df.where(df['_c6'] < 358)
df_sub.describe('_c6').show()
+-------+------------------+
|summary|               _c6|
+-------+------------------+
|  count|           2598037|
|   mean|353.15604897081914|
| stddev|3.5170213056883983|
|    min|               292|
|    max|               357|
+-------+------------------+
# Combine two row conditions with &; each comparison needs its own parentheses
# because & binds more tightly than > and < in Python.
df_filter = df.where((df['_c6'] > 340) & (df['_c5'] < 4))
df_filter.describe('_c6', '_c5').show()
+-------+------------------+------------------+
|summary|               _c6|               _c5|
+-------+------------------+------------------+
|  count|           1254131|           1254131|
|   mean|358.48713810598736| 1.474693632483369|
| stddev| 1.378961910349754|1.2067831502138422|
|    min|               341|                -1|
|    max|               361|                 3|
+-------+------------------+------------------+
3 Random Sampling
# Positional arguments are (withReplacement, fraction, seed): sample without
# replacement, about 5% of the rows, with seed 99.
df_sample = df.sample(False, 0.05, 99)
df_sample.describe('_c6').show()
+-------+------------------+
|summary|               _c6|
+-------+------------------+
|  count|            176015|
|   mean|354.69058318893275|
| stddev| 4.028614501676224|
|    min|               293|
|    max|               361|
+-------+------------------+
网友评论