# coding: utf-8
# Apriori algorithm implemented in Python
# In[1]:
# Import the required libraries
from numpy import *
# In[2]:
def loadDataSet():
    """Return the small hard-coded transaction list used by the demos.

    Returns:
        A new list of four transactions, each a list of integer item ids.
    """
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
# In[3]:
def createC1(dataSet):
    """Build the candidate 1-itemsets (C1) for a transaction data set.

    Args:
        dataSet: Iterable of transactions; each transaction is an iterable of
            hashable, mutually orderable items.

    Returns:
        A list of single-item frozensets, one per distinct item, ordered by
        item value. frozenset is used so the itemsets can serve as dict keys.
    """
    distinct_items = set()
    for transaction in dataSet:
        distinct_items.update(transaction)
    # Return a real list rather than a lazy ``map`` object: on Python 3 a map
    # iterator can only be consumed once, and callers iterate the candidates
    # once per transaction.
    return [frozenset([item]) for item in sorted(distinct_items)]
# In[4]:
# Compute the support of each candidate in Ck over the data set D and return
# the candidates whose support is at least minSupport.
def scanD(D, Ck, minSupport):
    """Count candidate itemsets in the transactions and filter by support.

    Args:
        D: Sized collection of transactions (each usable as the argument of
            ``frozenset.issubset``).
        Ck: Iterable of candidate itemsets (frozensets).
        minSupport: Minimum fraction of transactions that must contain a
            candidate for it to be kept.

    Returns:
        A tuple ``(retList, supportData)``: ``retList`` is the list of
        candidates meeting ``minSupport``; ``supportData`` maps every
        candidate that occurred in at least one transaction to its support.
    """
    # Materialize the candidates: they are scanned once per transaction, so a
    # one-shot iterator (e.g. a ``map`` object) would be empty after the
    # first transaction and every later count would be lost.
    candidates = list(Ck)
    ssCnt = {}
    for tid in D:
        for can in candidates:
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1
    numItems = float(len(D))
    retList = []
    supportData = {}
    for key, count in ssCnt.items():
        support = count / numItems
        supportData[key] = support
        if support >= minSupport:
            retList.append(key)
    # Historical ordering is "most recently counted first"; one reverse gives
    # the same result as repeated insert(0, ...) without the O(n) shifts.
    retList.reverse()
    return retList, supportData
# In[15]:
def aprioriGen(Lk, k):
    """Generate candidate k-itemsets by joining frequent (k-1)-itemsets.

    Two itemsets are joined when their first ``k - 2`` items, taken in sorted
    order, are equal; the candidate is their union.

    Args:
        Lk: List of frozensets, all of size ``k - 1``, whose items are
            mutually orderable.
        k: Size of the itemsets to generate.

    Returns:
        A list of frozensets, one per joinable pair, in pair order.
    """
    retList = []
    # Sort BEFORE slicing. Slicing list(itemset) first takes an arbitrary
    # prefix, because set iteration order is unspecified, and that can miss
    # valid candidates.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            if prefixes[i] == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList
# In[14]:
def apriori(dataSet, minSupport=0.5):
    """Find all frequent itemsets of a data set with the Apriori algorithm.

    Args:
        dataSet: Re-iterable collection of transactions (it is traversed
            twice), each an iterable of hashable items.
        minSupport: Minimum support an itemset needs to be kept.

    Returns:
        A tuple ``(L, supportData)``: ``L[i]`` is the list of frequent
        itemsets of size ``i + 1``; ``supportData`` is the merged support
        mapping returned by the successive ``scanD`` calls.
    """
    # Materialize the candidates: scanD walks them once per transaction, so a
    # one-shot iterator would be exhausted after the first transaction.
    C1 = list(createC1(dataSet))
    D = [set(transaction) for transaction in dataSet]
    print('D:', D)
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    # Grow itemsets one item at a time until a level has no frequent itemset.
    while L[k - 2]:
        Ck = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(D, Ck, minSupport)
        supportData.update(supK)
        if not Lk:
            break
        L.append(Lk)
        k += 1
    return L, supportData
# In[19]:
def calConf(freqSet, H, supportData, brl, minConf=0.7):
    """Evaluate the rules ``(freqSet - conseq) --> conseq`` for each conseq in H.

    Confidence is ``support(freqSet) / support(freqSet - conseq)``. Each rule
    reaching ``minConf`` is printed and appended to ``brl``.

    Args:
        freqSet: Frequent itemset (frozenset) the rules are derived from.
        H: Iterable of candidate consequents (frozensets).
        supportData: Mapping from itemset to support; must contain
            ``freqSet`` and every ``freqSet - conseq``.
        brl: List extended in place with
            ``(antecedent, consequent, confidence)`` tuples.
        minConf: Minimum confidence for a rule to be kept.

    Returns:
        The list of consequents from ``H`` whose rule met ``minConf``.

    Raises:
        KeyError: If a required itemset is missing from ``supportData``.
    """
    prunedH = []
    for conseq in H:
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minConf:
            print(antecedent, '-->', conseq, 'conf', conf)
            brl.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH
# In[21]:
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Recursively derive rules from ``freqSet`` with growing consequents.

    The consequents in ``H`` are evaluated first; those that meet ``minConf``
    are merged into consequents one item larger and evaluated in turn, until
    the consequent would no longer be a proper subset of ``freqSet``.

    Args:
        freqSet: Frequent itemset (frozenset) the rules are derived from.
        H: List of candidate consequents (frozensets), all the same size.
        supportData: Mapping from itemset to support.
        brl: List extended in place with the accepted rules.
        minConf: Minimum confidence for a rule to be kept.
    """
    if not H:
        # Nothing to evaluate; also avoids indexing H[0] on an empty list.
        return
    m = len(H[0])
    # A consequent must leave at least one item for the antecedent.
    if len(freqSet) > m:
        # Evaluate this level before growing it. Jumping straight to size
        # m + 1 would skip every rule whose consequent has exactly m items.
        Hmpl = calConf(freqSet, H, supportData, brl, minConf)
        print('Hmpl=', Hmpl)
        print('len(Hmpl)=', len(Hmpl), 'len(freqSet)=', len(freqSet))
        # At least two surviving consequents are needed to merge a larger one.
        if len(Hmpl) > 1:
            rulesFromConseq(freqSet, aprioriGen(Hmpl, m + 1),
                            supportData, brl, minConf)
# In[9]:
def generateRules(L, supportData, minConf=0.7):
    """Generate association rules from the frequent itemsets.

    Args:
        L: List of lists of frequent itemsets, where ``L[i]`` holds the
            itemsets of size ``i + 1``.
        supportData: Mapping from itemset to support.
        minConf: Minimum confidence for a rule to be kept.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples.
    """
    bigRuleList = []
    # L[0] holds single items, which cannot be split into a rule, so start at 1.
    for i in range(1, len(L)):
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            if i > 1:
                # Three or more items: delegate to rulesFromConseq.
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
            else:
                # Two items: only single-item consequents are possible.
                calConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList
# In[10]:
def testApriori():
    """Run ``apriori`` on the demo data at two support thresholds and print the results."""
    dataSet = loadDataSet()
    print('dataSet:', dataSet)
    L1, supportData1 = apriori(dataSet, minSupport=0.7)
    print('L(0.7):', L1)
    print('supportData(0.7):', supportData1)
    print('------------------------------------------')
    L2, supportData2 = apriori(dataSet, minSupport=0.5)
    print('L(0.5):', L2)
    # Print the value itself; previously the variable name was inside the string.
    print('supportData(0.5):', supportData2)
    print('------------------------------------------')
# In[11]:
def testGenerateRules():
    """Mine the demo data at minSupport 0.2 and print the derived rules."""
    dataSet = loadDataSet()
    L1, supportData1 = apriori(dataSet, minSupport=0.2)
    print('L(0.2):', L1)
    # This is the support mapping, so label it as such.
    print('supportData(0.2):', supportData1)
    # Confidence is a ratio of supports and never exceeds 1.0, so a threshold
    # above 1 always yields an empty rule list; use a reachable value.
    rules = generateRules(L1, supportData1, minConf=0.7)
    print('Rules:', rules)
# In[12]:
def main():
    """Run both demos: frequent-itemset mining, then rule generation."""
    testApriori()
    testGenerateRules()
# In[22]:
# Run the demos only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
# Reader comments (residue from the source web page; not part of the program)