pandas1
import pandas
# Read food_info.csv into a DataFrame.
food_info = pandas.read_csv("food_info.csv")
# Report the container type, then the dtype pandas inferred for each column.
container_type = type(food_info)
print(container_type)
print(food_info.dtypes)
<class 'pandas.core.frame.DataFrame'>
NDB_No int64
Shrt_Desc object
Water_(g) float64
Energ_Kcal int64
Protein_(g) float64
Lipid_Tot_(g) float64
Ash_(g) float64
Carbohydrt_(g) float64
Fiber_TD_(g) float64
Sugar_Tot_(g) float64
Calcium_(mg) float64
Iron_(mg) float64
Magnesium_(mg) float64
Phosphorus_(mg) float64
Potassium_(mg) float64
Sodium_(mg) float64
Zinc_(mg) float64
Copper_(mg) float64
Manganese_(mg) float64
Selenium_(mcg) float64
Vit_C_(mg) float64
Thiamin_(mg) float64
Riboflavin_(mg) float64
Niacin_(mg) float64
Vit_B6_(mg) float64
Vit_B12_(mcg) float64
Vit_A_IU float64
Vit_A_RAE float64
Vit_E_(mg) float64
Vit_D_mcg float64
Vit_D_IU float64
Vit_K_(mcg) float64
FA_Sat_(g) float64
FA_Mono_(g) float64
FA_Poly_(g) float64
Cholestrl_(mg) float64
dtype: object
# head() with no argument returns the first five rows; a count overrides that.
first_rows = food_info.head()
print(first_rows)
top_three = food_info.head(3)
print(top_three)
# Column labels, followed by the (row count, column count) tuple.
print(food_info.columns)
print(food_info.shape)
NDB_No Shrt_Desc Water_(g) Energ_Kcal Protein_(g) \
0 1001 BUTTER WITH SALT 15.87 717 0.85
1 1002 BUTTER WHIPPED WITH SALT 15.87 717 0.85
2 1003 BUTTER OIL ANHYDROUS 0.24 876 0.28
3 1004 CHEESE BLUE 42.41 353 21.40
4 1005 CHEESE BRICK 41.11 371 23.24
Lipid_Tot_(g) Ash_(g) Carbohydrt_(g) Fiber_TD_(g) Sugar_Tot_(g) ... \
0 81.11 2.11 0.06 0.0 0.06 ...
1 81.11 2.11 0.06 0.0 0.06 ...
2 99.48 0.00 0.00 0.0 0.00 ...
3 28.74 5.11 2.34 0.0 0.50 ...
4 29.68 3.18 2.79 0.0 0.51 ...
Vit_A_IU Vit_A_RAE Vit_E_(mg) Vit_D_mcg Vit_D_IU Vit_K_(mcg) \
0 2499.0 684.0 2.32 1.5 60.0 7.0
1 2499.0 684.0 2.32 1.5 60.0 7.0
2 3069.0 840.0 2.80 1.8 73.0 8.6
3 721.0 198.0 0.25 0.5 21.0 2.4
4 1080.0 292.0 0.26 0.5 22.0 2.5
FA_Sat_(g) FA_Mono_(g) FA_Poly_(g) Cholestrl_(mg)
0 51.368 21.021 3.043 215.0
1 50.489 23.426 3.012 219.0
2 61.924 28.732 3.694 256.0
3 18.669 7.778 0.800 75.0
4 18.764 8.598 0.784 94.0
[5 rows x 36 columns]
NDB_No Shrt_Desc Water_(g) Energ_Kcal Protein_(g) \
0 1001 BUTTER WITH SALT 15.87 717 0.85
1 1002 BUTTER WHIPPED WITH SALT 15.87 717 0.85
2 1003 BUTTER OIL ANHYDROUS 0.24 876 0.28
Lipid_Tot_(g) Ash_(g) Carbohydrt_(g) Fiber_TD_(g) Sugar_Tot_(g) ... \
0 81.11 2.11 0.06 0.0 0.06 ...
1 81.11 2.11 0.06 0.0 0.06 ...
2 99.48 0.00 0.00 0.0 0.00 ...
Vit_A_IU Vit_A_RAE Vit_E_(mg) Vit_D_mcg Vit_D_IU Vit_K_(mcg) \
0 2499.0 684.0 2.32 1.5 60.0 7.0
1 2499.0 684.0 2.32 1.5 60.0 7.0
2 3069.0 840.0 2.80 1.8 73.0 8.6
FA_Sat_(g) FA_Mono_(g) FA_Poly_(g) Cholestrl_(mg)
0 51.368 21.021 3.043 215.0
1 50.489 23.426 3.012 219.0
2 61.924 28.732 3.694 256.0
[3 rows x 36 columns]
Index(['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)',
'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)',
'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)',
'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)',
'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)',
'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)',
'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg',
'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)',
'Cholestrl_(mg)'],
dtype='object')
(8618, 36)
# pandas uses zero-indexing for the default row labels.
# Series object representing the row at index 0.
print(food_info.loc[0])
# Series object representing the seventh row.
food_info.loc[6]
# Series object representing the last row: the frame has 8618 rows, so 8617 is
# the largest valid label. A label past the end, e.g. food_info.loc[8620],
# raises a KeyError because that label is not in the index.
food_info.loc[8617]
# The object dtype is what pandas reports for values holding Python strings.
NDB_No 1001
Shrt_Desc BUTTER WITH SALT
Water_(g) 15.87
Energ_Kcal 717
Protein_(g) 0.85
Lipid_Tot_(g) 81.11
Ash_(g) 2.11
Carbohydrt_(g) 0.06
Fiber_TD_(g) 0
Sugar_Tot_(g) 0.06
Calcium_(mg) 24
Iron_(mg) 0.02
Magnesium_(mg) 2
Phosphorus_(mg) 24
Potassium_(mg) 24
Sodium_(mg) 643
Zinc_(mg) 0.09
Copper_(mg) 0
Manganese_(mg) 0
Selenium_(mcg) 1
Vit_C_(mg) 0
Thiamin_(mg) 0.005
Riboflavin_(mg) 0.034
Niacin_(mg) 0.042
Vit_B6_(mg) 0.003
Vit_B12_(mcg) 0.17
Vit_A_IU 2499
Vit_A_RAE 684
Vit_E_(mg) 2.32
Vit_D_mcg 1.5
Vit_D_IU 60
Vit_K_(mcg) 7
FA_Sat_(g) 51.368
FA_Mono_(g) 21.021
FA_Poly_(g) 3.043
Cholestrl_(mg) 215
Name: 0, dtype: object
NDB_No 93600
Shrt_Desc TURTLE GREEN RAW
Water_(g) 78.5
Energ_Kcal 89
Protein_(g) 19.8
Lipid_Tot_(g) 0.5
Ash_(g) 1.2
Carbohydrt_(g) 0
Fiber_TD_(g) 0
Sugar_Tot_(g) 0
Calcium_(mg) 118
Iron_(mg) 1.4
Magnesium_(mg) 20
Phosphorus_(mg) 180
Potassium_(mg) 230
Sodium_(mg) 68
Zinc_(mg) 1
Copper_(mg) 0.25
Manganese_(mg) NaN
Selenium_(mcg) 16.8
Vit_C_(mg) 0
Thiamin_(mg) 0.12
Riboflavin_(mg) 0.15
Niacin_(mg) 1.1
Vit_B6_(mg) 0.12
Vit_B12_(mcg) 1
Vit_A_IU 100
Vit_A_RAE 30
Vit_E_(mg) 0.5
Vit_D_mcg 0
Vit_D_IU 0
Vit_K_(mcg) 0.1
FA_Sat_(g) 0.127
FA_Mono_(g) 0.088
FA_Poly_(g) 0.17
Cholestrl_(mg) 50
Name: 8617, dtype: object
# Common pandas dtypes and what they hold:
#   object   - string values
#   int      - integer values
#   float    - float values
#   datetime - time values
#   bool     - Boolean values
# Print the dtype of every column in food_info.
print(food_info.dtypes)
NDB_No int64
Shrt_Desc object
Water_(g) float64
Energ_Kcal int64
Protein_(g) float64
Lipid_Tot_(g) float64
Ash_(g) float64
Carbohydrt_(g) float64
Fiber_TD_(g) float64
Sugar_Tot_(g) float64
Calcium_(mg) float64
Iron_(mg) float64
Magnesium_(mg) float64
Phosphorus_(mg) float64
Potassium_(mg) float64
Sodium_(mg) float64
Zinc_(mg) float64
Copper_(mg) float64
Manganese_(mg) float64
Selenium_(mcg) float64
Vit_C_(mg) float64
Thiamin_(mg) float64
Riboflavin_(mg) float64
Niacin_(mg) float64
Vit_B6_(mg) float64
Vit_B12_(mcg) float64
Vit_A_IU float64
Vit_A_RAE float64
Vit_E_(mg) float64
Vit_D_mcg float64
Vit_D_IU float64
Vit_K_(mcg) float64
FA_Sat_(g) float64
FA_Mono_(g) float64
FA_Poly_(g) float64
Cholestrl_(mg) float64
dtype: object
# A .loc label slice includes both endpoints: rows 3, 4, 5 and 6.
food_info.loc[3:6]
# Rows 2, 5 and 10, selected by handing .loc a list of labels.
# Method 1: name the list first.
two_five_ten = [2, 5, 10]
food_info.loc[two_five_ten]
# Method 2: pass the list literal directly.
food_info.loc[[2, 5, 10]]
NDB_No Shrt_Desc Water_(g) Energ_Kcal Protein_(g) Lipid_Tot_(g) Ash_(g) Carbohydrt_(g) Fiber_TD_(g) Sugar_Tot_(g) ... Vit_A_IU Vit_A_RAE Vit_E_(mg) Vit_D_mcg Vit_D_IU Vit_K_(mcg) FA_Sat_(g) FA_Mono_(g) FA_Poly_(g) Cholestrl_(mg)
2 1003 BUTTER OIL ANHYDROUS 0.24 876 0.28 99.48 0.00 0.00 0.0 0.00 ... 3069.0 840.0 2.80 1.8 73.0 8.6 61.924 28.732 3.694 256.0
5 1006 CHEESE BRIE 48.42 334 20.75 27.68 2.70 0.45 0.0 0.45 ... 592.0 174.0 0.24 0.5 20.0 2.3 17.410 8.013 0.826 100.0
10 1011 CHEESE COLBY 38.20 394 23.76 32.11 3.36 2.57 0.0 0.52 ... 994.0 264.0 0.28 0.6 24.0 2.7 20.218 9.280 0.953 95.0
3 rows × 36 columns
ndb_col
# Pull the "NDB_No" column out as a Series, keyed by a string literal.
ndb_col = food_info["NDB_No"]
print(ndb_col)
# The key can equally be held in a variable.
col_name = "NDB_No"
ndb_col = food_info[col_name]
0 1001
1 1002
2 1003
3 1004
4 1005
...
8613 83110
8614 90240
8615 90480
8616 90560
8617 93600
Name: NDB_No, Length: 8618, dtype: int64
# Indexing with a list of column names yields a DataFrame of just those columns.
columns = ["Zinc_(mg)", "Copper_(mg)"]
zinc_copper = food_info[columns]
print(zinc_copper)
# The same selection without the intermediate list variable.
zinc_copper = food_info[["Zinc_(mg)", "Copper_(mg)"]]
Zinc_(mg) Copper_(mg)
0 0.09 0.000
1 0.05 0.016
2 0.01 0.001
3 2.66 0.040
4 2.60 0.024
... ... ...
8613 1.10 0.100
8614 1.55 0.033
8615 0.19 0.020
8616 1.00 0.400
8617 1.00 0.250
[8618 rows x 2 columns]
print(food_info.columns)
print(food_info.head(2))
col_names = food_info.columns.tolist()
# Keep only the columns measured in grams, i.e. names ending in "(g)".
# Names ending in "(mg)" or "(mcg)" do not match this suffix and are skipped.
gram_columns = [name for name in col_names if name.endswith("(g)")]
gram_df = food_info[gram_columns]
print(gram_df.head(3))
Index(['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)',
'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)',
'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)',
'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)',
'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)',
'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)',
'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg',
'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)',
'Cholestrl_(mg)'],
dtype='object')
NDB_No Shrt_Desc Water_(g) Energ_Kcal Protein_(g) \
0 1001 BUTTER WITH SALT 15.87 717 0.85
1 1002 BUTTER WHIPPED WITH SALT 15.87 717 0.85
Lipid_Tot_(g) Ash_(g) Carbohydrt_(g) Fiber_TD_(g) Sugar_Tot_(g) ... \
0 81.11 2.11 0.06 0.0 0.06 ...
1 81.11 2.11 0.06 0.0 0.06 ...
Vit_A_IU Vit_A_RAE Vit_E_(mg) Vit_D_mcg Vit_D_IU Vit_K_(mcg) \
0 2499.0 684.0 2.32 1.5 60.0 7.0
1 2499.0 684.0 2.32 1.5 60.0 7.0
FA_Sat_(g) FA_Mono_(g) FA_Poly_(g) Cholestrl_(mg)
0 51.368 21.021 3.043 215.0
1 50.489 23.426 3.012 219.0
[2 rows x 36 columns]
Water_(g) Protein_(g) Lipid_Tot_(g) Ash_(g) Carbohydrt_(g) \
0 15.87 0.85 81.11 2.11 0.06
1 15.87 0.85 81.11 2.11 0.06
2 0.24 0.28 99.48 0.00 0.00
Fiber_TD_(g) Sugar_Tot_(g) FA_Sat_(g) FA_Mono_(g) FA_Poly_(g)
0 0.0 0.06 51.368 21.021 3.043
1 0.0 0.06 50.489 23.426 3.012
2 0.0 0.00 61.924 28.732 3.694
pandas2
import pandas
# Read food_info.csv into a DataFrame.
food_info = pandas.read_csv("food_info.csv")
# Report the container type, then the dtype pandas inferred for each column.
container_type = type(food_info)
print(container_type)
print(food_info.dtypes)
<class 'pandas.core.frame.DataFrame'>
NDB_No int64
Shrt_Desc object
Water_(g) float64
Energ_Kcal int64
Protein_(g) float64
Lipid_Tot_(g) float64
Ash_(g) float64
Carbohydrt_(g) float64
Fiber_TD_(g) float64
Sugar_Tot_(g) float64
Calcium_(mg) float64
Iron_(mg) float64
Magnesium_(mg) float64
Phosphorus_(mg) float64
Potassium_(mg) float64
Sodium_(mg) float64
Zinc_(mg) float64
Copper_(mg) float64
Manganese_(mg) float64
Selenium_(mcg) float64
Vit_C_(mg) float64
Thiamin_(mg) float64
Riboflavin_(mg) float64
Niacin_(mg) float64
Vit_B6_(mg) float64
Vit_B12_(mcg) float64
Vit_A_IU float64
Vit_A_RAE float64
Vit_E_(mg) float64
Vit_D_mcg float64
Vit_D_IU float64
Vit_K_(mcg) float64
FA_Sat_(g) float64
FA_Mono_(g) float64
FA_Poly_(g) float64
Cholestrl_(mg) float64
dtype: object
# head() with no argument returns the first five rows; a count overrides that.
first_rows = food_info.head()
print(first_rows)
top_three = food_info.head(3)
print(top_three)
# Column labels, followed by the (row count, column count) tuple.
print(food_info.columns)
print(food_info.shape)
NDB_No Shrt_Desc Water_(g) Energ_Kcal Protein_(g) \
0 1001 BUTTER WITH SALT 15.87 717 0.85
1 1002 BUTTER WHIPPED WITH SALT 15.87 717 0.85
2 1003 BUTTER OIL ANHYDROUS 0.24 876 0.28
3 1004 CHEESE BLUE 42.41 353 21.40
4 1005 CHEESE BRICK 41.11 371 23.24
Lipid_Tot_(g) Ash_(g) Carbohydrt_(g) Fiber_TD_(g) Sugar_Tot_(g) ... \
0 81.11 2.11 0.06 0.0 0.06 ...
1 81.11 2.11 0.06 0.0 0.06 ...
2 99.48 0.00 0.00 0.0 0.00 ...
3 28.74 5.11 2.34 0.0 0.50 ...
4 29.68 3.18 2.79 0.0 0.51 ...
Vit_A_IU Vit_A_RAE Vit_E_(mg) Vit_D_mcg Vit_D_IU Vit_K_(mcg) \
0 2499.0 684.0 2.32 1.5 60.0 7.0
1 2499.0 684.0 2.32 1.5 60.0 7.0
2 3069.0 840.0 2.80 1.8 73.0 8.6
3 721.0 198.0 0.25 0.5 21.0 2.4
4 1080.0 292.0 0.26 0.5 22.0 2.5
FA_Sat_(g) FA_Mono_(g) FA_Poly_(g) Cholestrl_(mg)
0 51.368 21.021 3.043 215.0
1 50.489 23.426 3.012 219.0
2 61.924 28.732 3.694 256.0
3 18.669 7.778 0.800 75.0
4 18.764 8.598 0.784 94.0
[5 rows x 36 columns]
NDB_No Shrt_Desc Water_(g) Energ_Kcal Protein_(g) \
0 1001 BUTTER WITH SALT 15.87 717 0.85
1 1002 BUTTER WHIPPED WITH SALT 15.87 717 0.85
2 1003 BUTTER OIL ANHYDROUS 0.24 876 0.28
Lipid_Tot_(g) Ash_(g) Carbohydrt_(g) Fiber_TD_(g) Sugar_Tot_(g) ... \
0 81.11 2.11 0.06 0.0 0.06 ...
1 81.11 2.11 0.06 0.0 0.06 ...
2 99.48 0.00 0.00 0.0 0.00 ...
Vit_A_IU Vit_A_RAE Vit_E_(mg) Vit_D_mcg Vit_D_IU Vit_K_(mcg) \
0 2499.0 684.0 2.32 1.5 60.0 7.0
1 2499.0 684.0 2.32 1.5 60.0 7.0
2 3069.0 840.0 2.80 1.8 73.0 8.6
FA_Sat_(g) FA_Mono_(g) FA_Poly_(g) Cholestrl_(mg)
0 51.368 21.021 3.043 215.0
1 50.489 23.426 3.012 219.0
2 61.924 28.732 3.694 256.0
[3 rows x 36 columns]
Index(['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)',
'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)',
'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)',
'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)',
'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)',
'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)',
'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg',
'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)',
'Cholestrl_(mg)'],
dtype='object')
(8618, 36)
# pandas uses zero-indexing for the default row labels.
# Series object representing the row at index 0.
print(food_info.loc[0])
# Series object representing the seventh row.
food_info.loc[6]
# Series object representing the last row: the frame has 8618 rows, so 8617 is
# the largest valid label. A label past the end, e.g. food_info.loc[8620],
# raises a KeyError because that label is not in the index.
food_info.loc[8617]
# The object dtype is what pandas reports for values holding Python strings.
NDB_No 1001
Shrt_Desc BUTTER WITH SALT
Water_(g) 15.87
Energ_Kcal 717
Protein_(g) 0.85
Lipid_Tot_(g) 81.11
Ash_(g) 2.11
Carbohydrt_(g) 0.06
Fiber_TD_(g) 0
Sugar_Tot_(g) 0.06
Calcium_(mg) 24
Iron_(mg) 0.02
Magnesium_(mg) 2
Phosphorus_(mg) 24
Potassium_(mg) 24
Sodium_(mg) 643
Zinc_(mg) 0.09
Copper_(mg) 0
Manganese_(mg) 0
Selenium_(mcg) 1
Vit_C_(mg) 0
Thiamin_(mg) 0.005
Riboflavin_(mg) 0.034
Niacin_(mg) 0.042
Vit_B6_(mg) 0.003
Vit_B12_(mcg) 0.17
Vit_A_IU 2499
Vit_A_RAE 684
Vit_E_(mg) 2.32
Vit_D_mcg 1.5
Vit_D_IU 60
Vit_K_(mcg) 7
FA_Sat_(g) 51.368
FA_Mono_(g) 21.021
FA_Poly_(g) 3.043
Cholestrl_(mg) 215
Name: 0, dtype: object
NDB_No 93600
Shrt_Desc TURTLE GREEN RAW
Water_(g) 78.5
Energ_Kcal 89
Protein_(g) 19.8
Lipid_Tot_(g) 0.5
Ash_(g) 1.2
Carbohydrt_(g) 0
Fiber_TD_(g) 0
Sugar_Tot_(g) 0
Calcium_(mg) 118
Iron_(mg) 1.4
Magnesium_(mg) 20
Phosphorus_(mg) 180
Potassium_(mg) 230
Sodium_(mg) 68
Zinc_(mg) 1
Copper_(mg) 0.25
Manganese_(mg) NaN
Selenium_(mcg) 16.8
Vit_C_(mg) 0
Thiamin_(mg) 0.12
Riboflavin_(mg) 0.15
Niacin_(mg) 1.1
Vit_B6_(mg) 0.12
Vit_B12_(mcg) 1
Vit_A_IU 100
Vit_A_RAE 30
Vit_E_(mg) 0.5
Vit_D_mcg 0
Vit_D_IU 0
Vit_K_(mcg) 0.1
FA_Sat_(g) 0.127
FA_Mono_(g) 0.088
FA_Poly_(g) 0.17
Cholestrl_(mg) 50
Name: 8617, dtype: object
# Common pandas dtypes and what they hold:
#   object   - string values
#   int      - integer values
#   float    - float values
#   datetime - time values
#   bool     - Boolean values
# Print the dtype of every column in food_info.
print(food_info.dtypes)
NDB_No int64
Shrt_Desc object
Water_(g) float64
Energ_Kcal int64
Protein_(g) float64
Lipid_Tot_(g) float64
Ash_(g) float64
Carbohydrt_(g) float64
Fiber_TD_(g) float64
Sugar_Tot_(g) float64
Calcium_(mg) float64
Iron_(mg) float64
Magnesium_(mg) float64
Phosphorus_(mg) float64
Potassium_(mg) float64
Sodium_(mg) float64
Zinc_(mg) float64
Copper_(mg) float64
Manganese_(mg) float64
Selenium_(mcg) float64
Vit_C_(mg) float64
Thiamin_(mg) float64
Riboflavin_(mg) float64
Niacin_(mg) float64
Vit_B6_(mg) float64
Vit_B12_(mcg) float64
Vit_A_IU float64
Vit_A_RAE float64
Vit_E_(mg) float64
Vit_D_mcg float64
Vit_D_IU float64
Vit_K_(mcg) float64
FA_Sat_(g) float64
FA_Mono_(g) float64
FA_Poly_(g) float64
Cholestrl_(mg) float64
dtype: object
# A .loc label slice includes both endpoints: rows 3, 4, 5 and 6.
food_info.loc[3:6]
# Rows 2, 5 and 10, selected by handing .loc a list of labels.
# Method 1: name the list first.
two_five_ten = [2, 5, 10]
food_info.loc[two_five_ten]
# Method 2: pass the list literal directly.
food_info.loc[[2, 5, 10]]
NDB_No Shrt_Desc Water_(g) Energ_Kcal Protein_(g) Lipid_Tot_(g) Ash_(g) Carbohydrt_(g) Fiber_TD_(g) Sugar_Tot_(g) ... Vit_A_IU Vit_A_RAE Vit_E_(mg) Vit_D_mcg Vit_D_IU Vit_K_(mcg) FA_Sat_(g) FA_Mono_(g) FA_Poly_(g) Cholestrl_(mg)
2 1003 BUTTER OIL ANHYDROUS 0.24 876 0.28 99.48 0.00 0.00 0.0 0.00 ... 3069.0 840.0 2.80 1.8 73.0 8.6 61.924 28.732 3.694 256.0
5 1006 CHEESE BRIE 48.42 334 20.75 27.68 2.70 0.45 0.0 0.45 ... 592.0 174.0 0.24 0.5 20.0 2.3 17.410 8.013 0.826 100.0
10 1011 CHEESE COLBY 38.20 394 23.76 32.11 3.36 2.57 0.0 0.52 ... 994.0 264.0 0.28 0.6 24.0 2.7 20.218 9.280 0.953 95.0
3 rows × 36 columns
ndb_col
# Pull the "NDB_No" column out as a Series, keyed by a string literal.
ndb_col = food_info["NDB_No"]
print(ndb_col)
# The key can equally be held in a variable.
col_name = "NDB_No"
ndb_col = food_info[col_name]
0 1001
1 1002
2 1003
3 1004
4 1005
...
8613 83110
8614 90240
8615 90480
8616 90560
8617 93600
Name: NDB_No, Length: 8618, dtype: int64
# Indexing with a list of column names yields a DataFrame of just those columns.
columns = ["Zinc_(mg)", "Copper_(mg)"]
zinc_copper = food_info[columns]
print(zinc_copper)
# The same selection without the intermediate list variable.
zinc_copper = food_info[["Zinc_(mg)", "Copper_(mg)"]]
Zinc_(mg) Copper_(mg)
0 0.09 0.000
1 0.05 0.016
2 0.01 0.001
3 2.66 0.040
4 2.60 0.024
... ... ...
8613 1.10 0.100
8614 1.55 0.033
8615 0.19 0.020
8616 1.00 0.400
8617 1.00 0.250
[8618 rows x 2 columns]
print(food_info.columns)
print(food_info.head(2))
col_names = food_info.columns.tolist()
# Keep only the columns measured in grams, i.e. names ending in "(g)".
# Names ending in "(mg)" or "(mcg)" do not match this suffix and are skipped.
gram_columns = [name for name in col_names if name.endswith("(g)")]
gram_df = food_info[gram_columns]
print(gram_df.head(3))
Index(['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)',
'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)',
'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)',
'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)',
'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)',
'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)',
'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg',
'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)',
'Cholestrl_(mg)'],
dtype='object')
NDB_No Shrt_Desc Water_(g) Energ_Kcal Protein_(g) \
0 1001 BUTTER WITH SALT 15.87 717 0.85
1 1002 BUTTER WHIPPED WITH SALT 15.87 717 0.85
Lipid_Tot_(g) Ash_(g) Carbohydrt_(g) Fiber_TD_(g) Sugar_Tot_(g) ... \
0 81.11 2.11 0.06 0.0 0.06 ...
1 81.11 2.11 0.06 0.0 0.06 ...
Vit_A_IU Vit_A_RAE Vit_E_(mg) Vit_D_mcg Vit_D_IU Vit_K_(mcg) \
0 2499.0 684.0 2.32 1.5 60.0 7.0
1 2499.0 684.0 2.32 1.5 60.0 7.0
FA_Sat_(g) FA_Mono_(g) FA_Poly_(g) Cholestrl_(mg)
0 51.368 21.021 3.043 215.0
1 50.489 23.426 3.012 219.0
[2 rows x 36 columns]
Water_(g) Protein_(g) Lipid_Tot_(g) Ash_(g) Carbohydrt_(g) \
0 15.87 0.85 81.11 2.11 0.06
1 15.87 0.85 81.11 2.11 0.06
2 0.24 0.28 99.48 0.00 0.00
Fiber_TD_(g) Sugar_Tot_(g) FA_Sat_(g) FA_Mono_(g) FA_Poly_(g)
0 0.0 0.06 51.368 21.021 3.043
1 0.0 0.06 50.489 23.426 3.012
2 0.0 0.00 61.924 28.732 3.694
pandas3
import pandas as pd
import numpy as np
# Read titanic_train.csv into a DataFrame and preview its first rows.
titanic_survival = pd.read_csv("titanic_train.csv")
titanic_survival.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
# pandas marks a missing value with NaN ("not a number").
# pd.isnull() takes a Series and returns a Series of booleans that is True
# wherever the value is missing.
age = titanic_survival["Age"]
print(age.loc[0:10])
age_is_null = pd.isnull(age)
print(age_is_null)
# Selecting with the boolean mask keeps only the entries where it is True.
age_null_true = age.loc[age_is_null]
print(age_null_true)
age_null_count = len(age_null_true)
print(age_null_count)
0 22.0
1 38.0
2 26.0
3 35.0
4 35.0
5 NaN
6 54.0
7 2.0
8 27.0
9 14.0
10 4.0
Name: Age, dtype: float64
0 False
1 False
2 False
3 False
4 False
...
886 False
887 False
888 True
889 False
890 False
Name: Age, Length: 891, dtype: bool
5 NaN
17 NaN
19 NaN
26 NaN
28 NaN
..
859 NaN
863 NaN
868 NaN
878 NaN
888 NaN
Name: Age, Length: 177, dtype: float64
177
# Any arithmetic involving a null value yields a null value, so this
# hand-rolled mean over the unfiltered column comes out as nan.
all_ages = titanic_survival["Age"]
mean_age = sum(all_ages) / len(all_ages)
print(mean_age)
nan
# Filter the missing values out first, then average what is left.
good_ages = titanic_survival["Age"][~age_is_null]
print(good_ages)
age_total = sum(good_ages)
correct_mean_age = age_total / len(good_ages)
print(correct_mean_age)
0 22.0
1 38.0
2 26.0
3 35.0
4 35.0
...
885 39.0
886 27.0
887 19.0
889 26.0
890 32.0
Name: Age, Length: 714, dtype: float64
29.69911764705882
# Missing data is so common that many pandas methods, Series.mean() among
# them, automatically filter it out.
correct_mean_age = titanic_survival["Age"].mean()
print(correct_mean_age)
29.6991176471
# Mean fare for each passenger class.
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    # The boolean mask keeps only the rows belonging to the current class.
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    pclass_fares = pclass_rows["Fare"]
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class
# Printed once, after the loop has filled in all three classes.
print(fares_by_class)
{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
passenger_survival
# index   - the column to group by
# values  - the column the calculation is applied to
# aggfunc - the calculation to perform; the string name "mean" is used rather
#           than the np.mean callable, which recent pandas releases deprecate
#           as an aggfunc argument
passenger_survival = titanic_survival.pivot_table(
    index="Pclass", values="Survived", aggfunc="mean"
)
print(passenger_survival)
Survived
Pclass
1 0.629630
2 0.472826
3 0.242363
# No aggfunc is given here, so pivot_table falls back to its default
# aggregation of the "Age" column within each passenger class.
passenger_age = titanic_survival.pivot_table(values="Age", index="Pclass")
print(passenger_age)
Age
Pclass
1 38.233441
2 29.877630
3 25.140620
# Several value columns can be aggregated in one call: total the "Fare" and
# "Survived" columns for each port of embarkation. The string name "sum" is
# used rather than the np.sum callable, which recent pandas releases
# deprecate as an aggfunc argument.
port_stats = titanic_survival.pivot_table(
    index="Embarked", values=["Fare", "Survived"], aggfunc="sum"
)
print(port_stats)
Fare Survived
Embarked
C 10072.2962 93
Q 1022.2543 30
S 17439.3988 217
# axis=1 (or axis='columns') drops every column that contains a null value.
drop_na_columns = titanic_survival.dropna(axis=1)
# axis=0 drops rows instead; subset limits the null check to the listed columns.
required_columns = ["Age", "Sex"]
new_titanic_survival = titanic_survival.dropna(axis=0, subset=required_columns)
print(new_titanic_survival)
PassengerId Survived Pclass \
0 1 0 3
1 2 1 1
2 3 1 3
3 4 1 1
4 5 0 3
.. ... ... ...
885 886 0 3
886 887 0 2
887 888 1 1
889 890 1 1
890 891 0 3
Name Sex Age SibSp \
0 Braund, Mr. Owen Harris male 22.0 1
1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1
2 Heikkinen, Miss. Laina female 26.0 0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1
4 Allen, Mr. William Henry male 35.0 0
.. ... ... ... ...
885 Rice, Mrs. William (Margaret Norton) female 39.0 0
886 Montvila, Rev. Juozas male 27.0 0
887 Graham, Miss. Margaret Edith female 19.0 0
889 Behr, Mr. Karl Howell male 26.0 0
890 Dooley, Mr. Patrick male 32.0 0
Parch Ticket Fare Cabin Embarked
0 0 A/5 21171 7.2500 NaN S
1 0 PC 17599 71.2833 C85 C
2 0 STON/O2. 3101282 7.9250 NaN S
3 0 113803 53.1000 C123 S
4 0 373450 8.0500 NaN S
.. ... ... ... ... ...
885 5 382652 29.1250 NaN Q
886 0 211536 13.0000 NaN S
887 0 112053 30.0000 B42 S
889 0 111369 30.0000 C148 C
890 0 370376 7.7500 NaN Q
[714 rows x 12 columns]
row_index_83_age
# .loc[row_label, column_label] reads a single cell.
row_index_83_age = titanic_survival.loc[83, "Age"]
print(row_index_83_age)
# NOTE: despite the "1000" in its name, this reads the row labelled 766.
row_index_1000_pclass = titanic_survival.loc[766, "Pclass"]
print(row_index_1000_pclass)
28.0
1
i
# Sort by age, oldest first; every row keeps its original index label.
new_titanic_survival = titanic_survival.sort_values("Age", ascending=False)
print(new_titanic_survival[0:10])
# reset_index(drop=True) renumbers the rows from 0 and discards the old labels
# instead of keeping them as an extra column.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
# Alias for the original misspelled name, so existing references keep working.
itanic_reindexed = titanic_reindexed
print(titanic_reindexed.iloc[0:10])
PassengerId Survived Pclass Name \
630 631 1 1 Barkworth, Mr. Algernon Henry Wilson
851 852 0 3 Svensson, Mr. Johan
493 494 0 1 Artagaveytia, Mr. Ramon
96 97 0 1 Goldschmidt, Mr. George B
116 117 0 3 Connors, Mr. Patrick
672 673 0 2 Mitchell, Mr. Henry Michael
745 746 0 1 Crosby, Capt. Edward Gifford
33 34 0 2 Wheadon, Mr. Edward H
54 55 0 1 Ostby, Mr. Engelhart Cornelius
280 281 0 3 Duane, Mr. Frank
Sex Age SibSp Parch Ticket Fare Cabin Embarked
630 male 80.0 0 0 27042 30.0000 A23 S
851 male 74.0 0 0 347060 7.7750 NaN S
493 male 71.0 0 0 PC 17609 49.5042 NaN C
96 male 71.0 0 0 PC 17754 34.6542 A5 C
116 male 70.5 0 0 370369 7.7500 NaN Q
672 male 70.0 0 0 C.A. 24580 10.5000 NaN S
745 male 70.0 1 1 WE/P 5735 71.0000 B22 S
33 male 66.0 0 0 C.A. 24579 10.5000 NaN S
54 male 65.0 0 1 113509 61.9792 B30 C
280 male 65.0 0 0 336439 7.7500 NaN Q
PassengerId Survived Pclass Name Sex \
0 631 1 1 Barkworth, Mr. Algernon Henry Wilson male
1 852 0 3 Svensson, Mr. Johan male
2 494 0 1 Artagaveytia, Mr. Ramon male
3 97 0 1 Goldschmidt, Mr. George B male
4 117 0 3 Connors, Mr. Patrick male
5 673 0 2 Mitchell, Mr. Henry Michael male
6 746 0 1 Crosby, Capt. Edward Gifford male
7 34 0 2 Wheadon, Mr. Edward H male
8 55 0 1 Ostby, Mr. Engelhart Cornelius male
9 281 0 3 Duane, Mr. Frank male
Age SibSp Parch Ticket Fare Cabin Embarked
0 80.0 0 0 27042 30.0000 A23 S
1 74.0 0 0 347060 7.7750 NaN S
2 71.0 0 0 PC 17609 49.5042 NaN C
3 71.0 0 0 PC 17754 34.6542 A5 C
4 70.5 0 0 370369 7.7500 NaN Q
5 70.0 0 0 C.A. 24580 10.5000 NaN S
6 70.0 1 1 WE/P 5735 71.0000 B22 S
7 66.0 0 0 C.A. 24579 10.5000 NaN S
8 65.0 0 1 113509 61.9792 B30 C
9 65.0 0 0 336439 7.7500 NaN Q
def hundredth_row(column):
    """Return the hundredth item of a Series.

    The lookup is positional (``iloc[99]``), so it does not depend on the
    index labels of ``column``.

    Raises:
        IndexError: if ``column`` holds fewer than 100 items.
    """
    return column.iloc[99]
# Collect the hundredth item from each column. The result is stored under its
# own name: assigning it back to `hundredth_row` would replace the function
# with a Series and make this statement fail if it were run a second time.
hundredth_values = titanic_survival.apply(hundredth_row)
print(hundredth_values)
PassengerId 100
Survived 0
Pclass 2
Name Kantor, Mr. Sinai
Sex male
Age 34
SibSp 1
Parch 0
Ticket 244367
Fare 26
Cabin NaN
Embarked S
dtype: object
def not_null_count(column):
    """Return the number of null (missing) entries in ``column``.

    NOTE: despite the name, this counts the entries that ARE null, not the
    ones that are not. The name is kept so existing callers keep working.
    """
    # Summing a boolean mask counts its True entries.
    return int(pd.isnull(column).sum())
# Run the counter once per column (axis=0 is the default direction for apply).
column_null_count = titanic_survival.apply(not_null_count, axis=0)
print(column_null_count)
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
# By passing in the axis=1 argument, we can use the DataFrame.apply() method
# to iterate over rows instead of columns.
def which_class(row):
    """Translate a row's ``Pclass`` value into a readable class name.

    Returns "Unknown" when ``Pclass`` is null, the matching name for 1, 2
    or 3, and ``None`` for any other value.
    """
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return "Unknown"
    class_names = {1: "First Class", 2: "Second Class", 3: "Third Class"}
    # .get() yields None for a value outside 1-3.
    return class_names.get(pclass)
# axis="columns" is the spelled-out form of axis=1: call which_class per row.
classes = titanic_survival.apply(which_class, axis="columns")
print(classes)
0 Third Class
1 First Class
2 Third Class
3 First Class
4 Third Class
...
886 Second Class
887 First Class
888 Third Class
889 First Class
890 Third Class
Length: 891, dtype: object
minor
def is_minor(row):
    """Return True when the row's ``Age`` is below 18, otherwise False.

    A missing (NaN) age compares as not-less-than 18, so it yields False.
    """
    return bool(row["Age"] < 18)
# Evaluate is_minor once per row (axis="columns" is equivalent to axis=1).
minors = titanic_survival.apply(is_minor, axis="columns")
print(minors)
def generate_age_label(row):
    """Label a row by its ``Age``: "unknown", "minor" or "adult".

    A null age gives "unknown", an age below 18 gives "minor", and anything
    else gives "adult".
    """
    age = row["Age"]
    # Test for a missing age first: NaN < 18 is False and would otherwise be
    # labelled "adult".
    if pd.isnull(age):
        return "unknown"
    return "minor" if age < 18 else "adult"
# Build one label per row (axis="columns" is equivalent to axis=1).
age_labels = titanic_survival.apply(generate_age_label, axis="columns")
print(age_labels)
0 False
1 False
2 False
3 False
4 False
...
886 False
887 False
888 False
889 False
890 False
Length: 891, dtype: bool
0 adult
1 adult
2 adult
3 adult
4 adult
...
886 adult
887 adult
888 unknown
889 adult
890 adult
Length: 891, dtype: object
# Attach the labels as a new column, then aggregate "Survived" within each
# label using pivot_table's default aggfunc.
titanic_survival["age_labels"] = age_labels
age_group_survival = titanic_survival.pivot_table(values="Survived", index="age_labels")
print(age_group_survival)
Survived
age_labels
adult 0.381032
minor 0.539823
unknown 0.293785
pandas4
#Series (collection of values)
#DataFrame (collection of Series objects)
#Panel (collection of DataFrame objects; deprecated and no longer available in current pandas releases)
#A Series object can hold many data types, including
#float - for representing float values
#int - for representing integer values
#bool - for representing Boolean values
#datetime64[ns] - for representing date & time, without time-zone
#datetime64[ns, tz] - for representing date & time, with time-zone
#timedelta64[ns] - for representing differences in dates & times (seconds, minutes, etc.)
#category - for representing categorical values
#object - for representing String values
#FILM - film name
#RottenTomatoes - Rotten Tomatoes critics average score
#RottenTomatoes_User - Rotten Tomatoes user average score
#RT_norm - Rotten Tomatoes critics average score (normalized to a 0 to 5 point system)
#RT_user_norm - Rotten Tomatoes user average score (normalized to a 0 to 5 point system)
#Metacritic - Metacritic critics average score
#Metacritic_User - Metacritic user average score
RottenTomatoes
import pandas as pd
# Read the Fandango score comparison file into a DataFrame.
fandango = pd.read_csv('fandango_score_comparison.csv')
# Selecting a single column yields a Series; show the first five of each.
series_film = fandango['FILM']
print(series_film.head(5))
series_rt = fandango['RottenTomatoes']
print(series_rt.head(5))
0 Avengers: Age of Ultron (2015)
1 Cinderella (2015)
2 Ant-Man (2015)
3 Do You Believe? (2015)
4 Hot Tub Time Machine 2 (2015)
Name: FILM, dtype: object
0 74
1 85
2 80
3 18
4 14
Name: RottenTomatoes, dtype: int64
Minions (2015)
# Import the Series object from pandas
from pandas import Series

# .values exposes the data behind a Series as a NumPy array.
film_names = series_film.values
print(type(film_names))
print(film_names)
rt_scores = series_rt.values
print(rt_scores)
# Build a Series of the scores that is labelled by film name.
series_custom = Series(data=rt_scores, index=film_names)
print(series_custom)
# A list of labels selects several entries at once.
series_custom[['Minions (2015)', 'Leviathan (2014)']]
<class 'numpy.ndarray'>
['Avengers: Age of Ultron (2015)' 'Cinderella (2015)' 'Ant-Man (2015)'
'Do You Believe? (2015)' 'Hot Tub Time Machine 2 (2015)'
'The Water Diviner (2015)' 'Irrational Man (2015)' 'Top Five (2014)'
'Shaun the Sheep Movie (2015)' 'Love & Mercy (2015)'
'Far From The Madding Crowd (2015)' 'Black Sea (2015)' 'Leviathan (2014)'
'Unbroken (2014)' 'The Imitation Game (2014)' 'Taken 3 (2015)'
'Ted 2 (2015)' 'Southpaw (2015)'
'Night at the Museum: Secret of the Tomb (2014)' 'Pixels (2015)'
'McFarland, USA (2015)' 'Insidious: Chapter 3 (2015)'
'The Man From U.N.C.L.E. (2015)' 'Run All Night (2015)'
'Trainwreck (2015)' 'Selma (2014)' 'Ex Machina (2015)'
'Still Alice (2015)' 'Wild Tales (2014)' 'The End of the Tour (2015)'
'Red Army (2015)' 'When Marnie Was There (2015)'
'The Hunting Ground (2015)' 'The Boy Next Door (2015)' 'Aloha (2015)'
'The Loft (2015)' '5 Flights Up (2015)' 'Welcome to Me (2015)'
'Saint Laurent (2015)' 'Maps to the Stars (2015)'
"I'll See You In My Dreams (2015)" 'Timbuktu (2015)' 'About Elly (2015)'
'The Diary of a Teenage Girl (2015)'
'Kingsman: The Secret Service (2015)' 'Tomorrowland (2015)'
'The Divergent Series: Insurgent (2015)' 'Annie (2014)'
'Fantastic Four (2015)' 'Terminator Genisys (2015)'
'Pitch Perfect 2 (2015)' 'Entourage (2015)' 'The Age of Adaline (2015)'
'Hot Pursuit (2015)' 'The DUFF (2015)' 'Black or White (2015)'
'Project Almanac (2015)' 'Ricki and the Flash (2015)'
'Seventh Son (2015)' 'Mortdecai (2015)' 'Unfinished Business (2015)'
'American Ultra (2015)' 'True Story (2015)' 'Child 44 (2015)'
'Dark Places (2015)' 'Birdman (2014)' 'The Gift (2015)'
'Unfriended (2015)' 'Monkey Kingdom (2015)' 'Mr. Turner (2014)'
'Seymour: An Introduction (2015)' 'The Wrecking Crew (2015)'
'American Sniper (2015)' 'Furious 7 (2015)'
'The Hobbit: The Battle of the Five Armies (2014)' 'San Andreas (2015)'
'Straight Outta Compton (2015)' 'Vacation (2015)' 'Chappie (2015)'
'Poltergeist (2015)' 'Paper Towns (2015)' 'Big Eyes (2014)'
'Blackhat (2015)' 'Self/less (2015)' 'Sinister 2 (2015)'
'Little Boy (2015)' 'Me and Earl and The Dying Girl (2015)'
'Maggie (2015)' 'Mad Max: Fury Road (2015)' 'Spy (2015)'
'The SpongeBob Movie: Sponge Out of Water (2015)' 'Paddington (2015)'
'Dope (2015)' 'What We Do in the Shadows (2015)' 'The Overnight (2015)'
'The Salt of the Earth (2015)' 'Song of the Sea (2014)'
'Fifty Shades of Grey (2015)' 'Get Hard (2015)' 'Focus (2015)'
'Jupiter Ascending (2015)' 'The Gallows (2015)'
'The Second Best Exotic Marigold Hotel (2015)' 'Strange Magic (2015)'
'The Gunman (2015)' 'Hitman: Agent 47 (2015)' 'Cake (2015)'
'The Vatican Tapes (2015)' 'A Little Chaos (2015)'
'The 100-Year-Old Man Who Climbed Out the Window and Disappeared (2015)'
'Escobar: Paradise Lost (2015)' 'Into the Woods (2014)'
'It Follows (2015)' 'Inherent Vice (2014)' 'A Most Violent Year (2014)'
"While We're Young (2015)" 'Clouds of Sils Maria (2015)'
'Testament of Youth (2015)' 'Infinitely Polar Bear (2015)'
'Phoenix (2015)' 'The Wolfpack (2015)'
'The Stanford Prison Experiment (2015)' 'Tangerine (2015)'
'Magic Mike XXL (2015)' 'Home (2015)' 'The Wedding Ringer (2015)'
'Woman in Gold (2015)' 'The Last Five Years (2015)'
'Mission: Impossible – Rogue Nation (2015)' 'Amy (2015)'
'Jurassic World (2015)' 'Minions (2015)' 'Max (2015)'
'Paul Blart: Mall Cop 2 (2015)' 'The Longest Ride (2015)'
'The Lazarus Effect (2015)' 'The Woman In Black 2 Angel of Death (2015)'
'Danny Collins (2015)' 'Spare Parts (2015)' 'Serena (2015)'
'Inside Out (2015)' 'Mr. Holmes (2015)' "'71 (2015)"
'Two Days, One Night (2014)' 'Gett: The Trial of Viviane Amsalem (2015)'
'Kumiko, The Treasure Hunter (2015)']
[ 74 85 80 18 14 63 42 86 99 89 84 82 99 51 90 9 46 59
50 17 79 59 68 60 85 99 92 88 96 92 96 89 92 10 19 11
52 71 51 60 94 99 97 95 75 50 30 27 9 26 67 32 54 8
71 39 34 64 12 12 11 46 45 26 26 92 93 60 94 98 100 93
72 81 61 50 90 27 30 31 55 72 34 20 13 20 81 54 97 93
78 98 87 96 82 96 99 25 29 57 26 16 62 17 17 7 49 13
40 67 52 71 96 73 90 83 89 81 80 99 84 84 95 62 45 27
52 60 92 97 71 54 35 5 31 14 22 77 52 18 98 87 97 97
100 87]
Avengers: Age of Ultron (2015) 74
Cinderella (2015) 85
Ant-Man (2015) 80
Do You Believe? (2015) 18
Hot Tub Time Machine 2 (2015) 14
...
Mr. Holmes (2015) 87
'71 (2015) 97
Two Days, One Night (2014) 97
Gett: The Trial of Viviane Amsalem (2015) 100
Kumiko, The Treasure Hunter (2015) 87
Length: 146, dtype: int64
Minions (2015) 54
Leviathan (2014) 99
dtype: int64
# Integer (positional) indexing is still available on a label-indexed Series
series_custom = Series(data=rt_scores, index=film_names)
series_custom[["Minions (2015)", "Leviathan (2014)"]]
# Positions 5 through 9, selected explicitly by position
fiveten = series_custom.iloc[5:10]
print(fiveten)
The Water Diviner (2015) 63
Irrational Man (2015) 42
Top Five (2014) 86
Shaun the Sheep Movie (2015) 99
Love & Mercy (2015) 89
dtype: int64
# Pull the film-name labels out as a plain Python list
original_index = list(series_custom.index)
print(original_index)

# Order the labels alphabetically, leaving original_index untouched
sorted_index = original_index.copy()
sorted_index.sort()

# reindex() returns a new Series whose entries follow the given label order
sorted_by_index = series_custom.reindex(index=sorted_index)
print(sorted_by_index)
['Avengers: Age of Ultron (2015)', 'Cinderella (2015)', 'Ant-Man (2015)', 'Do You Believe? (2015)', 'Hot Tub Time Machine 2 (2015)', 'The Water Diviner (2015)', 'Irrational Man (2015)', 'Top Five (2014)', 'Shaun the Sheep Movie (2015)', 'Love & Mercy (2015)', 'Far From The Madding Crowd (2015)', 'Black Sea (2015)', 'Leviathan (2014)', 'Unbroken (2014)', 'The Imitation Game (2014)', 'Taken 3 (2015)', 'Ted 2 (2015)', 'Southpaw (2015)', 'Night at the Museum: Secret of the Tomb (2014)', 'Pixels (2015)', 'McFarland, USA (2015)', 'Insidious: Chapter 3 (2015)', 'The Man From U.N.C.L.E. (2015)', 'Run All Night (2015)', 'Trainwreck (2015)', 'Selma (2014)', 'Ex Machina (2015)', 'Still Alice (2015)', 'Wild Tales (2014)', 'The End of the Tour (2015)', 'Red Army (2015)', 'When Marnie Was There (2015)', 'The Hunting Ground (2015)', 'The Boy Next Door (2015)', 'Aloha (2015)', 'The Loft (2015)', '5 Flights Up (2015)', 'Welcome to Me (2015)', 'Saint Laurent (2015)', 'Maps to the Stars (2015)', "I'll See You In My Dreams (2015)", 'Timbuktu (2015)', 'About Elly (2015)', 'The Diary of a Teenage Girl (2015)', 'Kingsman: The Secret Service (2015)', 'Tomorrowland (2015)', 'The Divergent Series: Insurgent (2015)', 'Annie (2014)', 'Fantastic Four (2015)', 'Terminator Genisys (2015)', 'Pitch Perfect 2 (2015)', 'Entourage (2015)', 'The Age of Adaline (2015)', 'Hot Pursuit (2015)', 'The DUFF (2015)', 'Black or White (2015)', 'Project Almanac (2015)', 'Ricki and the Flash (2015)', 'Seventh Son (2015)', 'Mortdecai (2015)', 'Unfinished Business (2015)', 'American Ultra (2015)', 'True Story (2015)', 'Child 44 (2015)', 'Dark Places (2015)', 'Birdman (2014)', 'The Gift (2015)', 'Unfriended (2015)', 'Monkey Kingdom (2015)', 'Mr. 
Turner (2014)', 'Seymour: An Introduction (2015)', 'The Wrecking Crew (2015)', 'American Sniper (2015)', 'Furious 7 (2015)', 'The Hobbit: The Battle of the Five Armies (2014)', 'San Andreas (2015)', 'Straight Outta Compton (2015)', 'Vacation (2015)', 'Chappie (2015)', 'Poltergeist (2015)', 'Paper Towns (2015)', 'Big Eyes (2014)', 'Blackhat (2015)', 'Self/less (2015)', 'Sinister 2 (2015)', 'Little Boy (2015)', 'Me and Earl and The Dying Girl (2015)', 'Maggie (2015)', 'Mad Max: Fury Road (2015)', 'Spy (2015)', 'The SpongeBob Movie: Sponge Out of Water (2015)', 'Paddington (2015)', 'Dope (2015)', 'What We Do in the Shadows (2015)', 'The Overnight (2015)', 'The Salt of the Earth (2015)', 'Song of the Sea (2014)', 'Fifty Shades of Grey (2015)', 'Get Hard (2015)', 'Focus (2015)', 'Jupiter Ascending (2015)', 'The Gallows (2015)', 'The Second Best Exotic Marigold Hotel (2015)', 'Strange Magic (2015)', 'The Gunman (2015)', 'Hitman: Agent 47 (2015)', 'Cake (2015)', 'The Vatican Tapes (2015)', 'A Little Chaos (2015)', 'The 100-Year-Old Man Who Climbed Out the Window and Disappeared (2015)', 'Escobar: Paradise Lost (2015)', 'Into the Woods (2014)', 'It Follows (2015)', 'Inherent Vice (2014)', 'A Most Violent Year (2014)', "While We're Young (2015)", 'Clouds of Sils Maria (2015)', 'Testament of Youth (2015)', 'Infinitely Polar Bear (2015)', 'Phoenix (2015)', 'The Wolfpack (2015)', 'The Stanford Prison Experiment (2015)', 'Tangerine (2015)', 'Magic Mike XXL (2015)', 'Home (2015)', 'The Wedding Ringer (2015)', 'Woman in Gold (2015)', 'The Last Five Years (2015)', 'Mission: Impossible – Rogue Nation (2015)', 'Amy (2015)', 'Jurassic World (2015)', 'Minions (2015)', 'Max (2015)', 'Paul Blart: Mall Cop 2 (2015)', 'The Longest Ride (2015)', 'The Lazarus Effect (2015)', 'The Woman In Black 2 Angel of Death (2015)', 'Danny Collins (2015)', 'Spare Parts (2015)', 'Serena (2015)', 'Inside Out (2015)', 'Mr. 
Holmes (2015)', "'71 (2015)", 'Two Days, One Night (2014)', 'Gett: The Trial of Viviane Amsalem (2015)', 'Kumiko, The Treasure Hunter (2015)']
'71 (2015) 97
5 Flights Up (2015) 52
A Little Chaos (2015) 40
A Most Violent Year (2014) 90
About Elly (2015) 97
..
What We Do in the Shadows (2015) 96
When Marnie Was There (2015) 89
While We're Young (2015) 83
Wild Tales (2014) 96
Woman in Gold (2015) 52
Length: 146, dtype: int64
# sort_index() orders by label (film name); sort_values() orders by score,
# lowest first. Both return a new Series.
sc2 = series_custom.sort_index()
sc3 = series_custom.sort_values()

# First ten entries of each ordering
print(sc2.head(10))
print(sc3.head(10))
'71 (2015) 97
5 Flights Up (2015) 52
A Little Chaos (2015) 40
A Most Violent Year (2014) 90
About Elly (2015) 97
Aloha (2015) 19
American Sniper (2015) 72
American Ultra (2015) 46
Amy (2015) 97
Annie (2014) 27
dtype: int64
Paul Blart: Mall Cop 2 (2015) 5
Hitman: Agent 47 (2015) 7
Hot Pursuit (2015) 8
Fantastic Four (2015) 9
Taken 3 (2015) 9
The Boy Next Door (2015) 10
The Loft (2015) 11
Unfinished Business (2015) 11
Mortdecai (2015) 12
Seventh Son (2015) 12
dtype: int64
series_custom
# A Series keeps its values in an ndarray (the core data type in NumPy),
# so NumPy functions can be applied to it directly.
import numpy as np

# Element-wise addition of the Series with itself
print(np.add(series_custom, series_custom))
# Element-wise sine of every score
print(np.sin(series_custom))
# Largest score: a single value is returned, not a Series
print(np.max(series_custom))
Avengers: Age of Ultron (2015) 148
Cinderella (2015) 170
Ant-Man (2015) 160
Do You Believe? (2015) 36
Hot Tub Time Machine 2 (2015) 28
...
Mr. Holmes (2015) 174
'71 (2015) 194
Two Days, One Night (2014) 194
Gett: The Trial of Viviane Amsalem (2015) 200
Kumiko, The Treasure Hunter (2015) 174
Length: 146, dtype: int64
Avengers: Age of Ultron (2015) -0.985146
Cinderella (2015) -0.176076
Ant-Man (2015) -0.993889
Do You Believe? (2015) -0.750987
Hot Tub Time Machine 2 (2015) 0.990607
...
Mr. Holmes (2015) -0.821818
'71 (2015) 0.379608
Two Days, One Night (2014) 0.379608
Gett: The Trial of Viviane Amsalem (2015) -0.506366
Kumiko, The Treasure Hunter (2015) -0.821818
Length: 146, dtype: float64
100
# Comparing a Series with a scalar is evaluated element-wise: the result is
# a Series holding one boolean value per film.
series_custom > 50
# Using that boolean Series as the key keeps only the entries marked True
series_greater_than_50 = series_custom.loc[series_custom > 50]
print(series_greater_than_50)
# Two conditions combined with & keep scores above 50 and below 75
criteria_one = series_custom.gt(50)
criteria_two = series_custom.lt(75)
both_criteria = series_custom.loc[criteria_one & criteria_two]
print(both_criteria)
Avengers: Age of Ultron (2015) 74
Cinderella (2015) 85
Ant-Man (2015) 80
The Water Diviner (2015) 63
Top Five (2014) 86
...
Mr. Holmes (2015) 87
'71 (2015) 97
Two Days, One Night (2014) 97
Gett: The Trial of Viviane Amsalem (2015) 100
Kumiko, The Treasure Hunter (2015) 87
Length: 94, dtype: int64
Avengers: Age of Ultron (2015) 74
The Water Diviner (2015) 63
Unbroken (2014) 51
Southpaw (2015) 59
Insidious: Chapter 3 (2015) 59
The Man From U.N.C.L.E. (2015) 68
Run All Night (2015) 60
5 Flights Up (2015) 52
Welcome to Me (2015) 71
Saint Laurent (2015) 51
Maps to the Stars (2015) 60
Pitch Perfect 2 (2015) 67
The Age of Adaline (2015) 54
The DUFF (2015) 71
Ricki and the Flash (2015) 64
Unfriended (2015) 60
American Sniper (2015) 72
The Hobbit: The Battle of the Five Armies (2014) 61
Paper Towns (2015) 55
Big Eyes (2014) 72
Maggie (2015) 54
Focus (2015) 57
The Second Best Exotic Marigold Hotel (2015) 62
The 100-Year-Old Man Who Climbed Out the Window and Disappeared (2015) 67
Escobar: Paradise Lost (2015) 52
Into the Woods (2014) 71
Inherent Vice (2014) 73
Magic Mike XXL (2015) 62
Woman in Gold (2015) 52
The Last Five Years (2015) 60
Jurassic World (2015) 71
Minions (2015) 54
Spare Parts (2015) 52
dtype: int64
# Data alignment: both Series share the FILM index, so arithmetic between
# them pairs up the entries that carry the same label.
rt_critics = Series(data=fandango["RottenTomatoes"].values, index=fandango["FILM"])
rt_users = Series(data=fandango["RottenTomatoes_User"].values, index=fandango["FILM"])
# Average of the critic and user score for every film
rt_mean = rt_critics.add(rt_users).div(2)
print(rt_mean)
FILM
Avengers: Age of Ultron (2015) 80.0
Cinderella (2015) 82.5
Ant-Man (2015) 85.0
Do You Believe? (2015) 51.0
Hot Tub Time Machine 2 (2015) 21.0
...
Mr. Holmes (2015) 82.5
'71 (2015) 89.5
Two Days, One Night (2014) 87.5
Gett: The Trial of Viviane Amsalem (2015) 90.5
Kumiko, The Treasure Hunter (2015) 75.0
Length: 146, dtype: float64
pandas5
import pandas as pd
# set_index() returns a new DataFrame indexed by the values of the given column.
# By default that column is removed from the DataFrame; drop=False keeps FILM
# as a regular column as well.
fandango = pd.read_csv("fandango_score_comparison.csv")
print(type(fandango))
print(fandango.index)
fandango_films = fandango.set_index(keys="FILM", drop=False)
print(fandango_films.index)
<class 'pandas.core.frame.DataFrame'>
RangeIndex(start=0, stop=146, step=1)
Index(['Avengers: Age of Ultron (2015)', 'Cinderella (2015)', 'Ant-Man (2015)',
'Do You Believe? (2015)', 'Hot Tub Time Machine 2 (2015)',
'The Water Diviner (2015)', 'Irrational Man (2015)', 'Top Five (2014)',
'Shaun the Sheep Movie (2015)', 'Love & Mercy (2015)',
...
'The Woman In Black 2 Angel of Death (2015)', 'Danny Collins (2015)',
'Spare Parts (2015)', 'Serena (2015)', 'Inside Out (2015)',
'Mr. Holmes (2015)', ''71 (2015)', 'Two Days, One Night (2014)',
'Gett: The Trial of Viviane Amsalem (2015)',
'Kumiko, The Treasure Hunter (2015)'],
dtype='object', name='FILM', length=146)
# Rows can be sliced by label with bracket notation or with loc[];
# a label slice includes both of its endpoints.
fandango_films["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]
fandango_films.loc["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]

# A single label selects one specific movie
fandango_films.loc["Kumiko, The Treasure Hunter (2015)"]

# A list of labels selects several movies at once
movies = [
    "Kumiko, The Treasure Hunter (2015)",
    "Do You Believe? (2015)",
    "Ant-Man (2015)",
]
fandango_films.loc[movies]
# Selecting multiple rows returns a DataFrame,
# whereas selecting an individual row returns a Series object instead.
FILM RottenTomatoes RottenTomatoes_User Metacritic Metacritic_User IMDB Fandango_Stars Fandango_Ratingvalue RT_norm RT_user_norm ... IMDB_norm RT_norm_round RT_user_norm_round Metacritic_norm_round Metacritic_user_norm_round IMDB_norm_round Metacritic_user_vote_count IMDB_user_vote_count Fandango_votes Fandango_Difference
FILM
Kumiko, The Treasure Hunter (2015) Kumiko, The Treasure Hunter (2015) 87 63 68 6.4 6.7 3.5 3.5 4.35 3.15 ... 3.35 4.5 3.0 3.5 3.0 3.5 19 5289 41 0.0
Do You Believe? (2015) Do You Believe? (2015) 18 84 22 4.7 5.4 5.0 4.5 0.90 4.20 ... 2.70 1.0 4.0 1.0 2.5 2.5 31 3136 1793 0.5
Ant-Man (2015) Ant-Man (2015) 80 90 64 8.1 7.8 5.0 4.5 4.00 4.50 ... 3.90 4.0 4.5 3.0 4.0 4.0 627 103660 12055 0.5
3 rows × 22 columns
# DataFrame.apply() runs custom Python logic over a DataFrame: the function
# passed in is called once per column (the default) or once per row, and
# receives that column or row as a Series object.
import numpy as np

# dtypes returns the data type of every column as a Series
types = fandango_films.dtypes
print(types)
# Keep only the float64 columns; `columns` then holds just their names
float_df = fandango_films.select_dtypes(include=["float64"])
float_columns = float_df.columns
print(float_df)
# `column` is a Series object representing one column
deviations = float_df.apply(lambda column: np.std(column))
print(deviations)
FILM object
RottenTomatoes int64
RottenTomatoes_User int64
Metacritic int64
Metacritic_User float64
IMDB float64
Fandango_Stars float64
Fandango_Ratingvalue float64
RT_norm float64
RT_user_norm float64
Metacritic_norm float64
Metacritic_user_nom float64
IMDB_norm float64
RT_norm_round float64
RT_user_norm_round float64
Metacritic_norm_round float64
Metacritic_user_norm_round float64
IMDB_norm_round float64
Metacritic_user_vote_count int64
IMDB_user_vote_count int64
Fandango_votes int64
Fandango_Difference float64
dtype: object
Metacritic_User IMDB \
FILM
Avengers: Age of Ultron (2015) 7.1 7.8
Cinderella (2015) 7.5 7.1
Ant-Man (2015) 8.1 7.8
Do You Believe? (2015) 4.7 5.4
Hot Tub Time Machine 2 (2015) 3.4 5.1
... ... ...
Mr. Holmes (2015) 7.9 7.4
'71 (2015) 7.5 7.2
Two Days, One Night (2014) 8.8 7.4
Gett: The Trial of Viviane Amsalem (2015) 7.3 7.8
Kumiko, The Treasure Hunter (2015) 6.4 6.7
Fandango_Stars \
FILM
Avengers: Age of Ultron (2015) 5.0
Cinderella (2015) 5.0
Ant-Man (2015) 5.0
Do You Believe? (2015) 5.0
Hot Tub Time Machine 2 (2015) 3.5
... ...
Mr. Holmes (2015) 4.0
'71 (2015) 3.5
Two Days, One Night (2014) 3.5
Gett: The Trial of Viviane Amsalem (2015) 3.5
Kumiko, The Treasure Hunter (2015) 3.5
Fandango_Ratingvalue RT_norm \
FILM
Avengers: Age of Ultron (2015) 4.5 3.70
Cinderella (2015) 4.5 4.25
Ant-Man (2015) 4.5 4.00
Do You Believe? (2015) 4.5 0.90
Hot Tub Time Machine 2 (2015) 3.0 0.70
... ... ...
Mr. Holmes (2015) 4.0 4.35
'71 (2015) 3.5 4.85
Two Days, One Night (2014) 3.5 4.85
Gett: The Trial of Viviane Amsalem (2015) 3.5 5.00
Kumiko, The Treasure Hunter (2015) 3.5 4.35
RT_user_norm Metacritic_norm \
FILM
Avengers: Age of Ultron (2015) 4.30 3.30
Cinderella (2015) 4.00 3.35
Ant-Man (2015) 4.50 3.20
Do You Believe? (2015) 4.20 1.10
Hot Tub Time Machine 2 (2015) 1.40 1.45
... ... ...
Mr. Holmes (2015) 3.90 3.35
'71 (2015) 4.10 4.15
Two Days, One Night (2014) 3.90 4.45
Gett: The Trial of Viviane Amsalem (2015) 4.05 4.50
Kumiko, The Treasure Hunter (2015) 3.15 3.40
Metacritic_user_nom IMDB_norm \
FILM
Avengers: Age of Ultron (2015) 3.55 3.90
Cinderella (2015) 3.75 3.55
Ant-Man (2015) 4.05 3.90
Do You Believe? (2015) 2.35 2.70
Hot Tub Time Machine 2 (2015) 1.70 2.55
... ... ...
Mr. Holmes (2015) 3.95 3.70
'71 (2015) 3.75 3.60
Two Days, One Night (2014) 4.40 3.70
Gett: The Trial of Viviane Amsalem (2015) 3.65 3.90
Kumiko, The Treasure Hunter (2015) 3.20 3.35
RT_norm_round RT_user_norm_round \
FILM
Avengers: Age of Ultron (2015) 3.5 4.5
Cinderella (2015) 4.5 4.0
Ant-Man (2015) 4.0 4.5
Do You Believe? (2015) 1.0 4.0
Hot Tub Time Machine 2 (2015) 0.5 1.5
... ... ...
Mr. Holmes (2015) 4.5 4.0
'71 (2015) 5.0 4.0
Two Days, One Night (2014) 5.0 4.0
Gett: The Trial of Viviane Amsalem (2015) 5.0 4.0
Kumiko, The Treasure Hunter (2015) 4.5 3.0
Metacritic_norm_round \
FILM
Avengers: Age of Ultron (2015) 3.5
Cinderella (2015) 3.5
Ant-Man (2015) 3.0
Do You Believe? (2015) 1.0
Hot Tub Time Machine 2 (2015) 1.5
... ...
Mr. Holmes (2015) 3.5
'71 (2015) 4.0
Two Days, One Night (2014) 4.5
Gett: The Trial of Viviane Amsalem (2015) 4.5
Kumiko, The Treasure Hunter (2015) 3.5
Metacritic_user_norm_round \
FILM
Avengers: Age of Ultron (2015) 3.5
Cinderella (2015) 4.0
Ant-Man (2015) 4.0
Do You Believe? (2015) 2.5
Hot Tub Time Machine 2 (2015) 1.5
... ...
Mr. Holmes (2015) 4.0
'71 (2015) 4.0
Two Days, One Night (2014) 4.5
Gett: The Trial of Viviane Amsalem (2015) 3.5
Kumiko, The Treasure Hunter (2015) 3.0
IMDB_norm_round \
FILM
Avengers: Age of Ultron (2015) 4.0
Cinderella (2015) 3.5
Ant-Man (2015) 4.0
Do You Believe? (2015) 2.5
Hot Tub Time Machine 2 (2015) 2.5
... ...
Mr. Holmes (2015) 3.5
'71 (2015) 3.5
Two Days, One Night (2014) 3.5
Gett: The Trial of Viviane Amsalem (2015) 4.0
Kumiko, The Treasure Hunter (2015) 3.5
Fandango_Difference
FILM
Avengers: Age of Ultron (2015) 0.5
Cinderella (2015) 0.5
Ant-Man (2015) 0.5
Do You Believe? (2015) 0.5
Hot Tub Time Machine 2 (2015) 0.5
... ...
Mr. Holmes (2015) 0.0
'71 (2015) 0.0
Two Days, One Night (2014) 0.0
Gett: The Trial of Viviane Amsalem (2015) 0.0
Kumiko, The Treasure Hunter (2015) 0.0
[146 rows x 15 columns]
Metacritic_User 1.505529
IMDB 0.955447
Fandango_Stars 0.538532
Fandango_Ratingvalue 0.501106
RT_norm 1.503265
RT_user_norm 0.997787
Metacritic_norm 0.972522
Metacritic_user_nom 0.752765
IMDB_norm 0.477723
RT_norm_round 1.509404
RT_user_norm_round 1.003559
Metacritic_norm_round 0.987561
Metacritic_user_norm_round 0.785412
IMDB_norm_round 0.501043
Fandango_Difference 0.152141
dtype: float64
# Keep only the two normalized user-score columns
rt_mt_user = float_df.loc[:, ["RT_user_norm", "Metacritic_user_nom"]]
print(rt_mt_user)
# axis="columns" hands each row to the function, giving the standard
# deviation between the two user scores of every film
rt_mt_user.apply(lambda row: np.std(row), axis="columns")
RT_user_norm Metacritic_user_nom
FILM
Avengers: Age of Ultron (2015) 4.30 3.55
Cinderella (2015) 4.00 3.75
Ant-Man (2015) 4.50 4.05
Do You Believe? (2015) 4.20 2.35
Hot Tub Time Machine 2 (2015) 1.40 1.70
... ... ...
Mr. Holmes (2015) 3.90 3.95
'71 (2015) 4.10 3.75
Two Days, One Night (2014) 3.90 4.40
Gett: The Trial of Viviane Amsalem (2015) 4.05 3.65
Kumiko, The Treasure Hunter (2015) 3.15 3.20
[146 rows x 2 columns]
FILM
Avengers: Age of Ultron (2015) 0.375
Cinderella (2015) 0.125
Ant-Man (2015) 0.225
Do You Believe? (2015) 0.925
Hot Tub Time Machine 2 (2015) 0.150
...
Mr. Holmes (2015) 0.025
'71 (2015) 0.175
Two Days, One Night (2014) 0.250
Gett: The Trial of Viviane Amsalem (2015) 0.200
Kumiko, The Treasure Hunter (2015) 0.025
Length: 146, dtype: float64
网友评论