# Setup (run in a shell, not inside Python): pip install scikit-learn







import numpy as np
# Read the dataset from a text file into a NumPy array.
dataset_filename = 'affinity_dataset.txt'
X = np.loadtxt(fname=dataset_filename)



# Count the rows in which column index 3 (apples) equals 1.
num_apple_purchases = 0
for sample in X:
    # The comparison yields a boolean; int() turns it into a 0/1 increment.
    num_apple_purchases += int(sample[3] == 1)
# Tip: indenting the print below into the loop shows the running count for
# every row instead of only the final total.
print('{0} people bought apples'.format(num_apple_purchases))
from collections import defaultdict

# Early draft of the all-rules counter: only premise occurrences are tallied
# here; the inner loop does not record any rule counts yet.
# NOTE(review): `vaild_rules`, `invaild_rules` and `num_occurances` look like
# misspellings of "valid" / "invalid" / "occurrences". The names are kept so
# that anything referring to them keeps working.
vaild_rules = defaultdict(int)
invaild_rules = defaultdict(int)
num_occurances = defaultdict(int)

# The shape is constant while iterating, so read it once up front and use the
# real column count instead of a hard-coded 5.
n_sample, n_features = X.shape
for sample in X:
    for premise in range(n_features):
        if sample[premise] == 0:
            continue
        # The premise item appears in this row.
        num_occurances[premise] += 1
        for conclusion in range(n_features):
            # A rule from an item to itself is skipped.
            if premise == conclusion:
                continue
# Build an all-False 100 x 5 matrix and write it to disk with integer
# formatting. The dtype can be swapped (e.g. for int or float).
# Axis 0 indexes rows and axis 1 indexes columns.
X = np.zeros(shape=(100, 5), dtype='bool')
np.savetxt(fname="affinity_dataset.txt", X=X, fmt='%d')
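# Note on the `fmt` parameter of np.savetxt (excerpt from the NumPy docstring):
#   fmt : str or sequence of strs, optional
#       A single format (%10.5f), a sequence of formats, or a
#       multi-format string, e.g. 'Iteration %d -- %10.5f', in which
#       case `delimiter` is ignored. For complex `X`, the legal options
#       for `fmt` are listed in the np.savetxt documentation.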
# Draw a single random float from the half-open interval [0, 1).
a = np.random.random_sample()


# Unpack the two matrix dimensions: rows are samples, columns are features.
n_samples, n_features = X.shape
print("This dataset has {0} samples and {1} features".format(*X.shape))
# Count the people who bought apples (column index 3).
num_apple_purchases = 0
for sample in X:
    # `==` compares; a single `=` is an assignment and is not allowed here.
    if sample[3] == 1:
        num_apple_purchases += 1
# Report the counter defined above under its exact name.
print('{0} people bought apples'.format(num_apple_purchases))
# Evaluate the rule "a person who buys apples (column 3) also buys bananas
# (column 4)". Rows without apples do not count towards either tally.
rule_valid = 0
rule_invalid = 0
for sample in X:
    if sample[3] == 1:  # This person bought Apples
        if sample[4] == 1:
            # This person bought both Apples and Bananas
            rule_valid += 1
        else:
            # This person bought Apples, but not Bananas
            rule_invalid += 1
print("{0} cases of the rule being valid were discovered".format(rule_valid))
print("{0} cases of the rule being invalid were discovered".format(rule_invalid))
# Same apples -> bananas rule check, with shorter output lines.
# Rows where column 3 (apples) is not 1 are ignored entirely.
rule_valid = 0
rule_invalid = 0
for sample in X:
    if sample[3] == 1:
        if sample[4] == 1:
            # Apples and bananas both bought: the rule holds.
            rule_valid += 1
        else:
            # Apples bought without bananas: the rule is broken. Without the
            # `else`, every apple buyer would be counted as invalid.
            rule_invalid += 1
print('{0} rule_valid'.format(rule_valid))
print('{0} rule_invalid'.format(rule_invalid))
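# The rule is: if someone bought apples, they may also have bought bananas.
# The rule is invalid when someone bought apples but did not buy bananas.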
# Support is the number of rows in which the apples -> bananas rule held.
support = rule_valid
# Confidence is the fraction of apple purchases in which the rule held.
# Fall back to 0.0 when nobody bought apples, to avoid a ZeroDivisionError.
confidence = rule_valid / num_apple_purchases if num_apple_purchases else 0.0
print("The support is {0} and the confidence is {1:.3f}.".format(support, confidence))
# Confidence can be thought of as a percentage using the following:
print("As a percentage, that is {0:.1f}%.".format(100 * confidence))
from collections import defaultdict

# Now compute for all possible rules
# valid_rules / invalid_rules are keyed by (premise, conclusion) column-index
# pairs; num_occurences is keyed by the premise column index.
valid_rules = defaultdict(int)
invalid_rules = defaultdict(int)
num_occurences = defaultdict(int)

for sample in X:
    for premise in range(n_features):
        if sample[premise] == 0:
            continue
        # Record that the premise was bought in another transaction
        num_occurences[premise] += 1
        for conclusion in range(n_features):
            if premise == conclusion:
                # It makes little sense to measure if X -> X.
                continue
            if sample[conclusion] == 1:
                # This person also bought the conclusion item
                valid_rules[(premise, conclusion)] += 1
            else:
                # This person bought the premise, but not the conclusion
                invalid_rules[(premise, conclusion)] += 1

# Support of a rule is the number of rows in which it held.
support = valid_rules
# Confidence is the fraction of premise occurrences in which the rule held.
# Every key of valid_rules has a premise counted at least once above, so the
# divisor is never zero.
confidence = defaultdict(float)
for premise, conclusion in valid_rules:
    confidence[(premise, conclusion)] = valid_rules[(premise, conclusion)] / num_occurences[premise]
from collections import defaultdict

# Count, for every ordered pair of distinct columns, how often the rule
# "premise bought -> conclusion bought" held or failed.
# Keys are (premise, conclusion) column-index pairs.
rule_valid = defaultdict(int)
rule_invalid = defaultdict(int)
# Number of rows in which each premise column was bought.
num_premise = defaultdict(int)
n_features = X.shape[1]
for sample in X:
    for premise in range(n_features):
        if sample[premise] == 0:
            continue
        # assumes the matrix holds only 0/1 values, so a non-zero entry means
        # the premise was bought - TODO confirm
        num_premise[premise] += 1
        for conclusion in range(n_features):
            if premise == conclusion:
                continue
            if sample[conclusion] == 1:
                rule_valid[(premise, conclusion)] += 1
            else:
                # Premise bought without the conclusion: the rule failed.
                rule_invalid[(premise, conclusion)] += 1

# Support of a rule is the number of rows in which it held.
support = rule_valid
# Confidence is the fraction of premise occurrences in which the rule held.
# Every key of rule_valid has a premise counted at least once above, so the
# divisor is never zero.
confidence = defaultdict(float)
for premise, conclusion in rule_valid:
    confidence[(premise, conclusion)] = rule_valid[(premise, conclusion)] / num_premise[premise]
# Column index -> item name, needed to print readable rules. It is defined
# here because the loop below would otherwise hit an undefined `features`.
features = ["bread", "milk", "cheese", "apples", "bananas"]
# Print every rule that has a confidence entry, with its confidence and support.
for premise, conclusion in confidence:
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule: If a person buys {0} they will also buy {1}".format(premise_name, conclusion_name))
    print(" - Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))
    print(" - Support: {0}".format(support[(premise, conclusion)]))
# Column index -> item name. Built once before the loop, since the list is the
# same for every rule.
features = ["bread", "milk", "cheese", "apples", "bananas"]
# Print every rule that has a confidence entry, with its confidence and support.
for premise, conclusion in confidence:
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print('If someone buy {0} then they may buy {1}'.format(premise_name, conclusion_name))
    print('confidence is {0:.3f}'.format(confidence[(premise, conclusion)]))
    print('support is {0}'.format(support[(premise, conclusion)]))
from pprint import pprint
import pprint

# Sample nested structure (string, list, string, dict) for trying out pprint.
data = (
    "test",
    [1, 2, 3, 'test', 4, 5],
    "This is a string!",
    dict(age=23, gender='F'),
)


from operator import itemgetter
# Order the (rule, support) pairs from the highest support value to the lowest.
sorted_support = list(support.items())
sorted_support.sort(key=itemgetter(1), reverse=True)
# Compute the mean for each attribute.
# axis=0 averages down the rows, giving one mean per column.
attribute_means = np.mean(X, axis=0)
# `assert` does nothing when its condition is true (e.g. `assert 1 == 1`) and
# raises an exception when it is false (e.g. `assert 1 == 2`).
assert attribute_means.shape == (n_features,)
# Discretise: 1 where a value is at or above its column mean, otherwise 0.
X_d = (X >= attribute_means).astype('int')
# sklearn.cross_validation has been replaced by sklearn.model_selection, so
# train_test_split is imported from the latter.
# Now, we split into a training and test set
from sklearn.model_selection import train_test_split

# Set the random state to the same number to get the same results as in the book
random_state = 14

X_train, X_test, y_train, y_test = train_test_split(X_d, y, random_state=random_state)
# Each message shows the shape of the label subset, not a bare count.
print("There are {} training samples".format(y_train.shape))
print("There are {} testing samples".format(y_test.shape))
# Partition the discretised features and their labels into train/test subsets.
from sklearn.model_selection import train_test_split

# A fixed seed keeps the split the same as in the book on every run.
random_state = 14
X_train, X_test, y_train, y_test = train_test_split(
    X_d,
    y,
    random_state=random_state,
)
# Each message shows the shape of the label subset.
print("There are {} training samples".format(y_train.shape))
print("There are {} testing samples".format(y_test.shape))
#train_X,test_X,train_y,test_y = train_test_split(train_data,train_target,test_size=0.3,random_state=5)
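# zip() takes iterables as arguments, packs their corresponding elements into
# tuples, and returns a list made of those tuples (in Python 3 it returns a
# lazy iterator instead of a list).
# If the iterables differ in length, the result is as long as the shortest one.
# The * operator can be used to unpack the tuples again.
# >>> a = [1,2,3]
# >>> b = [4,5,6]
# >>> c = [4,5,6,7,8]
# >>> zipped = zip(a,b)     # pack into a list of tuples
# [(1, 4), (2, 5), (3, 6)]
# >>> zip(a,c)              # length matches the shortest list
# [(1, 4), (2, 5), (3, 6)]
# >>> zip(*zipped)          # the reverse of zip: *zipped unpacks, giving a 2-D matrix-like result
# [(1, 2, 3), (4, 5, 6)]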
# Maps each class label to the number of samples whose `feature` column
# equals `value`.
class_counts = defaultdict(int)
# Iterate through each sample and count the frequency of each class/value pair.
# The loop variable is `label` rather than `y` so that, at module level, it
# does not overwrite the `y` array that the later `zip(X, y)` calls rely on.
for sample, label in zip(X, y_true):
    if sample[feature] == value:
        class_counts[label] += 1
# Scratch experiments with zip(X, y). The loops had no bodies, which is a
# syntax error; `pass` placeholders keep the file parseable.
# NOTE(review): the intended loop bodies are unknown - fill them in or remove
# these loops.
a = zip(X, y)
for b in a:
    pass  # b is a tuple pairing one element of X with one element of y
a = zip(X, y)
for b, c in a:
    pass  # b is the element from X, c the matching element from y
a = zip(X, y)
for b, c in a:
    pass
# Add up the counts of every class other than the most frequent one.
error = sum(
    count
    for label, count in class_counts.items()
    if label != most_frequent_class
)



