import pandas
# Load the food nutrition dataset into a DataFrame.
food_info = pandas.read_csv("food_info.csv")
#print(type(food_info))
# Show the dtype pandas inferred for every column.
print(food_info.dtypes)
输出 :
NDB_No int64
Shrt_Desc object
Water_(g) float64
Energ_Kcal int64
Protein_(g) float64
Lipid_Tot_(g) float64
Ash_(g) float64
Carbohydrt_(g) float64
Fiber_TD_(g) float64
Sugar_Tot_(g) float64
Calcium_(mg) float64
Iron_(mg) float64
Magnesium_(mg) float64
Phosphorus_(mg) float64
Potassium_(mg) float64
Sodium_(mg) float64
Zinc_(mg) float64
Copper_(mg) float64
Manganese_(mg) float64
Selenium_(mcg) float64
Vit_C_(mg) float64
Thiamin_(mg) float64
Riboflavin_(mg) float64
Niacin_(mg) float64
Vit_B6_(mg) float64
Vit_B12_(mcg) float64
Vit_A_IU float64
Vit_A_RAE float64
Vit_E_(mg) float64
Vit_D_mcg float64
Vit_D_IU float64
Vit_K_(mcg) float64
FA_Sat_(g) float64
FA_Mono_(g) float64
FA_Poly_(g) float64
Cholestrl_(mg) float64
dtype: object
#first_rows = food_info.head()
#print first_rows
#print(food_info.head(3))
#print food_info.columns
#print food_info.shape
输出 : (8618, 36)
#pandas uses zero-indexing
#Series object representing the row at index 0.
#print food_info.loc[0]
# Series object representing the seventh row.
#food_info.loc[6]
# Will throw an error: "KeyError: 'the label [8620] is not in the [index]'"
#food_info.loc[8620]
#The object dtype is equivalent to a string in Python
#object - For string values
#int - For integer values
#float - For float values
#datetime - For time values
#bool - For Boolean values
#print(food_info.dtypes)
# Returns a DataFrame containing the rows at indexes 3, 4, 5, and 6.
#food_info.loc[3:6]
# Returns a DataFrame containing the rows at indexes 2, 5, and 10. Either of the following approaches will work.
# Method 1
#two_five_ten = [2,5,10]
#food_info.loc[two_five_ten]
# Method 2
#food_info.loc[[2,5,10]]
# Series object representing the "NDB_No" column.
#ndb_col = food_info["NDB_No"]
#print ndb_col
# Alternatively, you can access a column by passing in a string variable.
#col_name = "NDB_No"
#ndb_col = food_info[col_name]
#columns = ["Zinc_(mg)", "Copper_(mg)"]
#zinc_copper = food_info[columns]
#print zinc_copper
#print zinc_copper
# Skipping the assignment.
#zinc_copper = food_info[["Zinc_(mg)", "Copper_(mg)"]]
#print(food_info.columns)
#print(food_info.head(2))
# Names of all columns, as a plain Python list.
col_names = food_info.columns.tolist()
#print col_names
# Keep only the columns whose name ends with the gram unit suffix "(g)".
gram_columns = [c for c in col_names if c.endswith("(g)")]
# Select those columns and preview the first three rows.
gram_df = food_info[gram_columns]
print(gram_df.head(3))
import pandas
# Reload the dataset, then show its column names and its first three rows.
food_info = pandas.read_csv("food_info.csv")
col_names = list(food_info.columns)
print(col_names)
print(food_info.head(3))
#print food_info["Iron_(mg)"]
#div_1000 = food_info["Iron_(mg)"] / 1000
#print div_1000
# Adds 100 to each value in the column and returns a Series object.
#add_100 = food_info["Iron_(mg)"] + 100
# Subtracts 100 from each value in the column and returns a Series object.
#sub_100 = food_info["Iron_(mg)"] - 100
# Multiplies each value in the column by 2 and returns a Series object.
#mult_2 = food_info["Iron_(mg)"]*2
#It applies the arithmetic operator to the first value in both columns, the second value in both columns, and so on
# Arithmetic between two columns is applied element-wise, row by row.
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
# Convert iron from milligrams to grams and store the result as a new column.
iron_grams = food_info["Iron_(mg)"] / 1000
food_info["Iron_(g)"] = iron_grams
#Score=2×(Protein_(g))−0.75×(Lipid_Tot_(g))
weighted_protein = food_info["Protein_(g)"] * 2
weighted_fat = -0.75 * food_info["Lipid_Tot_(g)"]
initial_rating = weighted_protein + weighted_fat
# the "Vit_A_IU" column ranges from 0 to 100000, while the "Fiber_TD_(g)" column ranges from 0 to 79
#For certain calculations, columns like "Vit_A_IU" can have a greater effect on the result,
#due to the scale of the values
# The largest value in the "Energ_Kcal" column.
max_calories = food_info["Energ_Kcal"].max()
# Divide the values in "Energ_Kcal" by the largest value.
normalized_calories = food_info["Energ_Kcal"] / max_calories
# Same normalization (divide by the column maximum) for protein and fat.
normalized_protein = food_info["Protein_(g)"] / food_info["Protein_(g)"].max()
normalized_fat = food_info["Lipid_Tot_(g)"] / food_info["Lipid_Tot_(g)"].max()
# Store the normalized values as new columns.
food_info["Normalized_Protein"] = normalized_protein
food_info["Normalized_Fat"] = normalized_fat
#By default, pandas will sort the data by the column we specify in ascending order and return a new DataFrame
# Sorts the DataFrame in-place, rather than returning a new DataFrame.
#print food_info["Sodium_(mg)"]
# Sort the DataFrame in place by sodium content, ascending (the default order).
food_info.sort_values("Sodium_(mg)", inplace=True)
print(food_info["Sodium_(mg)"])
#Sorts by descending order, rather than ascending.
food_info.sort_values("Sodium_(mg)", inplace=True, ascending=False)
print(food_info["Sodium_(mg)"])
输出 :
760 0.0
610 0.0
611 0.0
8387 0.0
8607 0.0
629 0.0
630 0.0
631 0.0
6470 0.0
654 0.0
8599 0.0
633 0.0
634 0.0
635 0.0
637 0.0
638 0.0
639 0.0
646 0.0
653 0.0
632 0.0
606 0.0
6463 0.0
655 0.0
673 0.0
658 0.0
3636 0.0
659 0.0
660 0.0
661 0.0
3663 0.0
...
8153 NaN
8155 NaN
8156 NaN
8157 NaN
8158 NaN
8159 NaN
8160 NaN
8161 NaN
8163 NaN
8164 NaN
8165 NaN
8167 NaN
8169 NaN
8170 NaN
8172 NaN
8173 NaN
8174 NaN
8175 NaN
8176 NaN
8177 NaN
8178 NaN
8179 NaN
8180 NaN
8181 NaN
8183 NaN
8184 NaN
8185 NaN
8195 NaN
8251 NaN
8267 NaN
Name: Sodium_(mg), dtype: float64
276 38758.0
5814 27360.0
6192 26050.0
1242 26000.0
1245 24000.0
1243 24000.0
1244 23875.0
292 17000.0
1254 11588.0
5811 10600.0
8575 9690.0
291 8068.0
1249 8031.0
5812 7893.0
1292 7851.0
293 7203.0
4472 7027.0
4836 6820.0
1261 6580.0
3747 6008.0
1266 5730.0
4835 5586.0
4834 5493.0
1263 5356.0
1553 5203.0
1552 5053.0
1251 4957.0
1257 4843.0
294 4616.0
8613 4450.0
...
8153 NaN
8155 NaN
8156 NaN
8157 NaN
8158 NaN
8159 NaN
8160 NaN
8161 NaN
8163 NaN
8164 NaN
8165 NaN
8167 NaN
8169 NaN
8170 NaN
8172 NaN
8173 NaN
8174 NaN
8175 NaN
8176 NaN
8177 NaN
8178 NaN
8179 NaN
8180 NaN
8181 NaN
8183 NaN
8184 NaN
8185 NaN
8195 NaN
8251 NaN
8267 NaN
Name: Sodium_(mg), dtype: float64
import pandas as pd
import numpy as np
# Load the Titanic passenger data.
titanic_survival = pd.read_csv("titanic_train.csv")
titanic_survival.head()
#The Pandas library uses NaN, which stands for "not a number", to indicate a missing value.
#we can use the pandas.isnull() function which takes a pandas series and returns a series of True and False values
age = titanic_survival["Age"]
#print(age.loc[0:10])
# Boolean mask: True wherever the age is missing.
age_is_null = pd.isnull(age)
#print age_is_null
# Keep only the entries with a missing age, then count them.
age_null_true = age.loc[age_is_null]
#print age_null_true
age_null_count = age_null_true.shape[0]
print(age_null_count)
输出 : 177
#The result of this is that mean_age would be nan. This is because any calculations we do with a null value also result in a null value
# Deliberately naive mean: the missing ages propagate through sum(), so the result is nan.
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print(mean_age)
输出 : nan
#we have to filter out the missing values before we calculate the mean.
# Invert the null mask to keep only the rows that have an age.
good_ages = titanic_survival["Age"][~age_is_null]
#print good_ages
# With the missing values removed, the manual mean is well defined.
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)
输出 : 29.6991176471
# missing data is so common that many pandas methods automatically filter for it
# Series.mean() skips missing values on its own, so no manual filtering is needed.
correct_mean_age = titanic_survival["Age"].mean()
print(correct_mean_age)
输出 : 29.6991176471
#mean fare for each class
# Build a {passenger class: mean fare} mapping, one class at a time.
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    # Rows belonging to the current passenger class.
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    pclass_fares = pclass_rows["Fare"]
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class
# Printed once, after every class has been processed.
print(fares_by_class)
输出 : {1: 84.154687499999994, 2: 20.662183152173913, 3: 13.675550101832993}
#index tells the method which column to group by
#values is the column that we want to apply the calculation to
#aggfunc specifies the calculation we want to perform
# Mean of "Survived" (i.e. the survival rate) for each passenger class.
# The string "mean" selects the same aggregation as np.mean and avoids the
# FutureWarning newer pandas versions raise for NumPy callables.
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
print(passenger_survival)
输出 :
Pclass
1 0.629630
2 0.472826
3 0.242363
Name: Survived, dtype: float64
# No aggfunc is given, so pivot_table falls back to its default aggregation
# (the printed result below is the mean age per passenger class).
passenger_age = titanic_survival.pivot_table(values="Age", index="Pclass")
print(passenger_age)
输出 :
Pclass
1 38.233441
2 29.877630
3 25.140620
Name: Age, dtype: float64
# Total fare and total number of survivors for each port of embarkation.
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
print(port_stats)
#specifying axis=1 or axis='columns' will drop any columns that have null values
drop_na_columns = titanic_survival.dropna(axis=1)
# Drop only the rows that are missing a value in "Age" or "Sex".
new_titanic_survival = titanic_survival.dropna(axis=0, subset=["Age", "Sex"])
#print new_titanic_survival
# .loc[row_label, column_label] returns a single cell.
row_index_83_age = titanic_survival.loc[83, "Age"]
# NOTE: despite the variable name, this reads the row labelled 766, not 1000.
row_index_1000_pclass = titanic_survival.loc[766, "Pclass"]
print(row_index_83_age)
print(row_index_1000_pclass)
输出 :
28.0
1
# Sort passengers from oldest to youngest; the rows keep their original index labels.
new_titanic_survival = titanic_survival.sort_values("Age", ascending=False)
print(new_titanic_survival[0:10])
# Discard the old labels so the sorted rows get a fresh 0-based index.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print(titanic_reindexed.iloc[0:10])
# This function returns the hundredth item from a series
def hundredth_row(column):
    """Return the hundredth item (position 99) of *column*.

    column: an object supporting positional ``.iloc`` indexing, e.g. a
        pandas Series.

    Raises IndexError if *column* has fewer than 100 items.
    """
    # Positions are zero-based, so the hundredth item sits at position 99.
    hundredth_item = column.iloc[99]
    return hundredth_item
# Return the hundredth item from each column.
# The result gets its own name so it does not overwrite the hundredth_row
# function, which would make the function unusable afterwards.
hundredth_row_values = titanic_survival.apply(hundredth_row)
print(hundredth_row_values)
def not_null_count(column):
    """Return the number of missing (null/NaN) values in *column*.

    NOTE: despite its name, this function counts the entries that ARE null;
    the name is kept so existing callers continue to work.

    column: a pandas Series (or other array-like accepted by ``pd.isnull``).
    """
    # True marks a missing entry; summing the mask counts the True values.
    column_null = pd.isnull(column)
    return int(column_null.sum())
# Apply the counter to every column; the result holds one count per column.
column_null_count = titanic_survival.apply(not_null_count)
print(column_null_count)
#By passing in the axis=1 argument, we can use the DataFrame.apply() method to iterate over rows instead of columns.
def which_class(row):
    """Return a human-readable label for the passenger class of *row*.

    row: a mapping-like object with a 'Pclass' entry (e.g. one DataFrame row
        passed by ``DataFrame.apply(..., axis=1)``).

    Returns "Unknown" when 'Pclass' is missing (null), "First Class",
    "Second Class" or "Third Class" for the values 1, 2 and 3, and None for
    any other value.
    """
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return "Unknown"
    elif pclass == 1:
        return "First Class"
    elif pclass == 2:
        return "Second Class"
    elif pclass == 3:
        return "Third Class"
    # Any other value has no label.
    return None
# axis=1 makes apply() call which_class once per row.
classes = titanic_survival.apply(which_class, axis=1)
print(classes)
输出 :
0 Third Class
1 First Class
2 Third Class
3 First Class
4 Third Class
5 Third Class
6 First Class
7 Third Class
8 Third Class
9 Second Class
10 Third Class
11 First Class
12 Third Class
13 Third Class
14 Third Class
15 Second Class
16 Third Class
17 Second Class
18 Third Class
19 Third Class
20 Second Class
21 Second Class
22 Third Class
23 First Class
24 Third Class
25 Third Class
26 Third Class
27 First Class
28 Third Class
29 Third Class
...
861 Second Class
862 First Class
863 Third Class
864 Second Class
865 Second Class
866 Second Class
867 First Class
868 Third Class
869 Third Class
870 Third Class
871 First Class
872 First Class
873 Third Class
874 Second Class
875 Third Class
876 Third Class
877 Third Class
878 Third Class
879 First Class
880 Second Class
881 Third Class
882 Third Class
883 Second Class
884 Third Class
885 Third Class
886 Second Class
887 First Class
888 Third Class
889 First Class
890 Third Class
dtype: object
def is_minor(row):
    """Return True when the "Age" entry of *row* is below 18, else False.

    row: a mapping-like object with an "Age" entry (e.g. one DataFrame row).

    A missing age (NaN) compares as not-less-than 18, so it yields False.
    """
    # bool() guarantees a plain Python bool even when the comparison
    # produces a NumPy boolean.
    return bool(row["Age"] < 18)
# Evaluate is_minor once per row (axis="columns" is the named form of axis=1).
minors = titanic_survival.apply(is_minor, axis="columns")
#print minors
def generate_age_label(row):
    """Classify *row* by its "Age" entry.

    row: a mapping-like object with an "Age" entry (e.g. one DataFrame row).

    Returns "unknown" when the age is missing, "minor" when it is below 18,
    and "adult" otherwise.
    """
    age = row["Age"]
    # The null check must come first: a NaN age would otherwise fall
    # through the comparison and be labelled "adult".
    if pd.isnull(age):
        return "unknown"
    if age < 18:
        return "minor"
    return "adult"
# Label every passenger row as "minor", "adult" or "unknown".
age_labels = titanic_survival.apply(generate_age_label, axis=1)
print(age_labels)
输出 :
0 adult
1 adult
2 adult
3 adult
4 adult
5 unknown
6 adult
7 minor
8 adult
9 minor
10 minor
11 adult
12 adult
13 adult
14 minor
15 adult
16 minor
17 unknown
18 adult
19 unknown
20 adult
21 adult
22 minor
23 adult
24 minor
25 adult
26 unknown
27 adult
28 unknown
29 unknown
...
861 adult
862 adult
863 unknown
864 adult
865 adult
866 adult
867 adult
868 unknown
869 minor
870 adult
871 adult
872 adult
873 adult
874 adult
875 minor
876 adult
877 adult
878 unknown
879 adult
880 adult
881 adult
882 adult
883 adult
884 adult
885 adult
886 adult
887 adult
888 unknown
889 adult
890 adult
dtype: object
# Attach the labels as a new column so they can be used as a grouping key.
titanic_survival['age_labels'] = age_labels
# Aggregate "Survived" per age label using pivot_table's default aggregation.
age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived")
print(age_group_survival)
输出 :
age_labels
adult 0.381032
minor 0.539823
unknown 0.293785
Name: Survived, dtype: float64
#Series (collection of values)
#DataFrame (collection of Series objects)
#Panel (collection of DataFrame objects; deprecated in pandas 0.20 and removed in pandas 0.25)
#A Series object can hold many data types, including
#float - for representing float values
#int - for representing integer values
#bool - for representing Boolean values
#datetime64[ns] - for representing date & time, without time-zone
#datetime64[ns, tz] - for representing date & time, with time-zone
#timedelta64[ns] - for representing differences in dates & times (seconds, minutes, etc.)
#category - for representing categorical values
#object - for representing String values
#FILM - film name
#RottenTomatoes - Rotten Tomatoes critics average score
#RottenTomatoes_User - Rotten Tomatoes user average score
#RT_norm - Rotten Tomatoes critics average score (normalized to a 0 to 5 point system)
#RT_user_norm - Rotten Tomatoes user average score (normalized to a 0 to 5 point system)
#Metacritic - Metacritic critics average score
#Metacritic_User - Metacritic user average score
import pandas as pd
# Load the movie score comparison data.
fandango = pd.read_csv('fandango_score_comparison.csv')
# Selecting a single column yields a Series; preview the first five entries of each.
series_film = fandango['FILM']
print(series_film.head(5))
series_rt = fandango['RottenTomatoes']
print(series_rt.head(5))
输出 :
0 Avengers: Age of Ultron (2015)
1 Cinderella (2015)
2 Ant-Man (2015)
3 Do You Believe? (2015)
4 Hot Tub Time Machine 2 (2015)
Name: FILM, dtype: object
0 74
1 85
2 80
3 18
4 14
Name: RottenTomatoes, dtype: int64
# Import the Series object from pandas
from pandas import Series
# Underlying arrays of the film titles and of the critic scores.
film_names = series_film.values
#print type(film_names)
#print film_names
rt_scores = series_rt.values
#print rt_scores
# Build a Series of scores labelled by film title instead of by position.
series_custom = Series(data=rt_scores, index=film_names)
# Label-based lookup with a list of titles.
series_custom[['Minions (2015)', 'Leviathan (2014)']]
输出 :
Minions (2015) 54
Leviathan (2014) 99
dtype: int64
# Integer (positional) indexing is also available, even though the index holds film titles.
series_custom = Series(data=rt_scores, index=film_names)
series_custom[['Minions (2015)', 'Leviathan (2014)']]
# Items at positions 5 through 9.
fiveten = series_custom.iloc[5:10]
print(fiveten)
输出 :
The Water Diviner (2015) 63
Irrational Man (2015) 42
Top Five (2014) 86
Shaun the Sheep Movie (2015) 99
Love & Mercy (2015) 89
dtype: int64
# Current index labels (film titles) as a plain list.
original_index = list(series_custom.index)
#print original_index
# Sort the labels, then reorder the Series to follow that order.
sorted_index = sorted(original_index)
sorted_by_index = series_custom.reindex(index=sorted_index)
#print sorted_by_index
输出 :
'71 (2015) 97
5 Flights Up (2015) 52
A Little Chaos (2015) 40
A Most Violent Year (2014) 90
About Elly (2015) 97
Aloha (2015) 19
American Sniper (2015) 72
American Ultra (2015) 46
Amy (2015) 97
Annie (2014) 27
Ant-Man (2015) 80
Avengers: Age of Ultron (2015) 74
Big Eyes (2014) 72
Birdman (2014) 92
Black Sea (2015) 82
Black or White (2015) 39
Blackhat (2015) 34
Cake (2015) 49
Chappie (2015) 30
Child 44 (2015) 26
Cinderella (2015) 85
Clouds of Sils Maria (2015) 89
Danny Collins (2015) 77
Dark Places (2015) 26
Do You Believe? (2015) 18
Dope (2015) 87
Entourage (2015) 32
Escobar: Paradise Lost (2015) 52
Ex Machina (2015) 92
Fantastic Four (2015) 9
..
The Loft (2015) 11
The Longest Ride (2015) 31
The Man From U.N.C.L.E. (2015) 68
The Overnight (2015) 82
The Salt of the Earth (2015) 96
The Second Best Exotic Marigold Hotel (2015) 62
The SpongeBob Movie: Sponge Out of Water (2015) 78
The Stanford Prison Experiment (2015) 84
The Vatican Tapes (2015) 13
The Water Diviner (2015) 63
The Wedding Ringer (2015) 27
The Wolfpack (2015) 84
The Woman In Black 2 Angel of Death (2015) 22
The Wrecking Crew (2015) 93
Timbuktu (2015) 99
Tomorrowland (2015) 50
Top Five (2014) 86
Trainwreck (2015) 85
True Story (2015) 45
Two Days, One Night (2014) 97
Unbroken (2014) 51
Unfinished Business (2015) 11
Unfriended (2015) 60
Vacation (2015) 27
Welcome to Me (2015) 71
What We Do in the Shadows (2015) 96
When Marnie Was There (2015) 89
While We're Young (2015) 83
Wild Tales (2014) 96
Woman in Gold (2015) 52
dtype: int64
# sort_index() orders by label, sort_values() orders by the stored score.
sc2 = series_custom.sort_index()
sc3 = series_custom.sort_values()
#print(sc2[0:10])
# First ten entries of the value-sorted Series.
print(sc3.head(10))
输出 :
Paul Blart: Mall Cop 2 (2015) 5
Hitman: Agent 47 (2015) 7
Hot Pursuit (2015) 8
Fantastic Four (2015) 9
Taken 3 (2015) 9
The Boy Next Door (2015) 10
The Loft (2015) 11
Unfinished Business (2015) 11
Mortdecai (2015) 12
Seventh Son (2015) 12
dtype: int64
#The values in a Series object are treated as an ndarray, the core data type in NumPy
import numpy as np
# Add the Series to itself element-wise
print(np.add(series_custom, series_custom))
# Apply sine function to each value
np.sin(series_custom)
# Return the highest value (will return a single value not a Series)
np.max(series_custom)
输出 :
Avengers: Age of Ultron (2015) 148
Cinderella (2015) 170
Ant-Man (2015) 160
Do You Believe? (2015) 36
Hot Tub Time Machine 2 (2015) 28
The Water Diviner (2015) 126
Irrational Man (2015) 84
Top Five (2014) 172
Shaun the Sheep Movie (2015) 198
Love & Mercy (2015) 178
Far From The Madding Crowd (2015) 168
Black Sea (2015) 164
Leviathan (2014) 198
Unbroken (2014) 102
The Imitation Game (2014) 180
Taken 3 (2015) 18
Ted 2 (2015) 92
Southpaw (2015) 118
Night at the Museum: Secret of the Tomb (2014) 100
Pixels (2015) 34
McFarland, USA (2015) 158
Insidious: Chapter 3 (2015) 118
The Man From U.N.C.L.E. (2015) 136
Run All Night (2015) 120
Trainwreck (2015) 170
Selma (2014) 198
Ex Machina (2015) 184
Still Alice (2015) 176
Wild Tales (2014) 192
The End of the Tour (2015) 184
...
Clouds of Sils Maria (2015) 178
Testament of Youth (2015) 162
Infinitely Polar Bear (2015) 160
Phoenix (2015) 198
The Wolfpack (2015) 168
The Stanford Prison Experiment (2015) 168
Tangerine (2015) 190
Magic Mike XXL (2015) 124
Home (2015) 90
The Wedding Ringer (2015) 54
Woman in Gold (2015) 104
The Last Five Years (2015) 120
Mission: Impossible – Rogue Nation (2015) 184
Amy (2015) 194
Jurassic World (2015) 142
Minions (2015) 108
Max (2015) 70
Paul Blart: Mall Cop 2 (2015) 10
The Longest Ride (2015) 62
The Lazarus Effect (2015) 28
The Woman In Black 2 Angel of Death (2015) 44
Danny Collins (2015) 154
Spare Parts (2015) 104
Serena (2015) 36
Inside Out (2015) 196
Mr. Holmes (2015) 174
'71 (2015) 194
Two Days, One Night (2014) 194
Gett: The Trial of Viviane Amsalem (2015) 200
Kumiko, The Treasure Hunter (2015) 174
dtype: int64
Out[36]:
100
#will actually return a Series object with a boolean value for each film
series_custom > 50
# Indexing with the boolean Series keeps only the entries where it is True.
series_greater_than_50 = series_custom[series_custom > 50]
criteria_one = series_custom > 50
criteria_two = series_custom < 75
# & combines the two masks element-wise: strictly between 50 and 75.
both_criteria = series_custom[criteria_one & criteria_two]
print(both_criteria)
输出 :
Avengers: Age of Ultron (2015) 74
The Water Diviner (2015) 63
Unbroken (2014) 51
Southpaw (2015) 59
Insidious: Chapter 3 (2015) 59
The Man From U.N.C.L.E. (2015) 68
Run All Night (2015) 60
5 Flights Up (2015) 52
Welcome to Me (2015) 71
Saint Laurent (2015) 51
Maps to the Stars (2015) 60
Pitch Perfect 2 (2015) 67
The Age of Adaline (2015) 54
The DUFF (2015) 71
Ricki and the Flash (2015) 64
Unfriended (2015) 60
American Sniper (2015) 72
The Hobbit: The Battle of the Five Armies (2014) 61
Paper Towns (2015) 55
Big Eyes (2014) 72
Maggie (2015) 54
Focus (2015) 57
The Second Best Exotic Marigold Hotel (2015) 62
The 100-Year-Old Man Who Climbed Out the Window and Disappeared (2015) 67
Escobar: Paradise Lost (2015) 52
Into the Woods (2014) 71
Inherent Vice (2014) 73
Magic Mike XXL (2015) 62
Woman in Gold (2015) 52
The Last Five Years (2015) 60
Jurassic World (2015) 71
Minions (2015) 54
Spare Parts (2015) 52
dtype: int64
# Data alignment: both Series are built on the same FILM index,
# so the arithmetic below pairs up values by film title.
rt_critics = Series(data=fandango['RottenTomatoes'].values, index=fandango['FILM'])
rt_users = Series(data=fandango['RottenTomatoes_User'].values, index=fandango['FILM'])
# Average of the critic score and the user score for each film.
rt_mean = (rt_critics + rt_users) / 2
print(rt_mean)
输出 :
FILM
Avengers: Age of Ultron (2015) 80.0
Cinderella (2015) 82.5
Ant-Man (2015) 85.0
Do You Believe? (2015) 51.0
Hot Tub Time Machine 2 (2015) 21.0
The Water Diviner (2015) 62.5
Irrational Man (2015) 47.5
Top Five (2014) 75.0
Shaun the Sheep Movie (2015) 90.5
Love & Mercy (2015) 88.0
Far From The Madding Crowd (2015) 80.5
Black Sea (2015) 71.0
Leviathan (2014) 89.0
Unbroken (2014) 60.5
The Imitation Game (2014) 91.0
Taken 3 (2015) 27.5
Ted 2 (2015) 52.0
Southpaw (2015) 69.5
Night at the Museum: Secret of the Tomb (2014) 54.0
Pixels (2015) 35.5
McFarland, USA (2015) 84.0
Insidious: Chapter 3 (2015) 57.5
The Man From U.N.C.L.E. (2015) 74.0
Run All Night (2015) 59.5
Trainwreck (2015) 79.5
Selma (2014) 92.5
Ex Machina (2015) 89.0
Still Alice (2015) 86.5
Wild Tales (2014) 94.0
The End of the Tour (2015) 90.5
...
Clouds of Sils Maria (2015) 78.0
Testament of Youth (2015) 80.0
Infinitely Polar Bear (2015) 78.0
Phoenix (2015) 90.0
The Wolfpack (2015) 78.5
The Stanford Prison Experiment (2015) 85.5
Tangerine (2015) 90.5
Magic Mike XXL (2015) 63.0
Home (2015) 55.0
The Wedding Ringer (2015) 46.5
Woman in Gold (2015) 66.5
The Last Five Years (2015) 60.0
Mission: Impossible – Rogue Nation (2015) 91.0
Amy (2015) 94.0
Jurassic World (2015) 76.0
Minions (2015) 53.0
Max (2015) 54.0
Paul Blart: Mall Cop 2 (2015) 20.5
The Longest Ride (2015) 52.0
The Lazarus Effect (2015) 18.5
The Woman In Black 2 Angel of Death (2015) 23.5
Danny Collins (2015) 76.0
Spare Parts (2015) 67.5
Serena (2015) 21.5
Inside Out (2015) 94.0
Mr. Holmes (2015) 82.5
'71 (2015) 89.5
Two Days, One Night (2014) 87.5
Gett: The Trial of Viviane Amsalem (2015) 90.5
Kumiko, The Treasure Hunter (2015) 75.0
dtype: float64
import pandas as pd
输出 : RangeIndex(start=0, stop=146, step=1)
#set_index() returns a new DataFrame that is indexed by the values in the specified column.
#By default it also drops that column from the DataFrame;
#passing drop=False (as below) keeps the FILM column as a regular column too.
fandango = pd.read_csv('fandango_score_comparison.csv')
print(type(fandango))
# Index the rows by film title while keeping FILM as a regular column (drop=False).
fandango_films = fandango.set_index('FILM', drop=False)
#print(fandango_films.index)
输出 : <class 'pandas.core.frame.DataFrame'>
# Label-based slicing works with plain brackets as well as with loc[]
fandango_films["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]
fandango_films.loc["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"]
# A single film, looked up by its title
fandango_films.loc['Kumiko, The Treasure Hunter (2015)']
# Several films at once, looked up with a list of titles
movies = [
    'Kumiko, The Treasure Hunter (2015)',
    'Do You Believe? (2015)',
    'Ant-Man (2015)',
]
fandango_films.loc[movies]
#When selecting multiple rows, a DataFrame is returned,
#but when selecting an individual row, a Series object is returned instead
#The apply() method in Pandas allows us to specify Python logic
#The apply() method requires you to pass in a vectorized operation
#that can be applied over each Series object.
import numpy as np
# dtype of every column, as a Series keyed by column name
types = fandango_films.dtypes
#print types
# keep the float64 entries only; the index of the result holds the matching column names
float_columns = types[types.values == 'float64'].index
# restrict the DataFrame to those float columns
float_df = fandango_films[float_columns]
#print float_df
# apply() hands each column to the lambda as a Series named `col`
deviations = float_df.apply(lambda col: np.std(col))
print(deviations)
输出 :
Metacritic_User 1.505529
IMDB 0.955447
Fandango_Stars 0.538532
Fandango_Ratingvalue 0.501106
RT_norm 1.503265
RT_user_norm 0.997787
Metacritic_norm 0.972522
Metacritic_user_nom 0.752765
IMDB_norm 0.477723
RT_norm_round 1.509404
RT_user_norm_round 1.003559
Metacritic_norm_round 0.987561
Metacritic_user_norm_round 0.785412
IMDB_norm_round 0.501043
Fandango_Difference 0.152141
dtype: float64
# The two normalized user-score columns.
rt_mt_user = float_df[['RT_user_norm', 'Metacritic_user_nom']]
# With axis=1 the lambda receives one row at a time, giving one standard deviation per film.
rt_mt_user.apply(lambda film_scores: np.std(film_scores), axis=1)
输出 :
FILM
Avengers: Age of Ultron (2015) 0.375
Cinderella (2015) 0.125
Ant-Man (2015) 0.225
Do You Believe? (2015) 0.925
Hot Tub Time Machine 2 (2015) 0.150
The Water Diviner (2015) 0.150
Irrational Man (2015) 0.575
Top Five (2014) 0.100
Shaun the Sheep Movie (2015) 0.150
Love & Mercy (2015) 0.050
Far From The Madding Crowd (2015) 0.050
Black Sea (2015) 0.150
Leviathan (2014) 0.175
Unbroken (2014) 0.125
The Imitation Game (2014) 0.250
Taken 3 (2015) 0.000
Ted 2 (2015) 0.175
Southpaw (2015) 0.050
Night at the Museum: Secret of the Tomb (2014) 0.000
Pixels (2015) 0.025
McFarland, USA (2015) 0.425
Insidious: Chapter 3 (2015) 0.325
The Man From U.N.C.L.E. (2015) 0.025
Run All Night (2015) 0.350
Trainwreck (2015) 0.350
Selma (2014) 0.375
Ex Machina (2015) 0.175
Still Alice (2015) 0.175
Wild Tales (2014) 0.100
The End of the Tour (2015) 0.350
...
Clouds of Sils Maria (2015) 0.100
Testament of Youth (2015) 0.000
Infinitely Polar Bear (2015) 0.075
Phoenix (2015) 0.025
The Wolfpack (2015) 0.075
The Stanford Prison Experiment (2015) 0.050
Tangerine (2015) 0.325
Magic Mike XXL (2015) 0.250
Home (2015) 0.200
The Wedding Ringer (2015) 0.825
Woman in Gold (2015) 0.225
The Last Five Years (2015) 0.225
Mission: Impossible – Rogue Nation (2015) 0.250
Amy (2015) 0.075
Jurassic World (2015) 0.275
Minions (2015) 0.125
Max (2015) 0.350
Paul Blart: Mall Cop 2 (2015) 0.300
The Longest Ride (2015) 0.625
The Lazarus Effect (2015) 0.650
The Woman In Black 2 Angel of Death (2015) 0.475
Danny Collins (2015) 0.100
Spare Parts (2015) 0.300
Serena (2015) 0.700
Inside Out (2015) 0.025
Mr. Holmes (2015) 0.025
'71 (2015) 0.175
Two Days, One Night (2014) 0.250
Gett: The Trial of Viviane Amsalem (2015) 0.200
Kumiko, The Treasure Hunter (2015) 0.025
dtype: float64
网友评论